diff --git a/chrome/change_configuration.sh b/chrome/change_configuration.sh
deleted file mode 100644
index c4fa27b..0000000
--- a/chrome/change_configuration.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-if [ -d "/user_data/news_fetch.profile" ]
-then
- echo "Profile already exists, skipping creation"
-else
- google-chrome &
- sleep 5
- cp -r /home/seluser/.config/google-chrome/Default /user_data/news_fetch.profile
- PID=$(pidof chrome)
- echo "Now killing processes with pid:" $PID
- kill $PID
- cd /user_data/news_fetch.profile
- wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip
- unzip master
-fi
-
-google-chrome --user-data-dir=/user_data/news_fetch.profile
\ No newline at end of file
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 69dbae5..8027b99 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -27,21 +27,22 @@ services:
- ${CONTAINER_DATA}/files:/sync/local_files
- ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
- ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
- command:
+ command:
- nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path
- lsyncd
- /sync/nas_sync.config
- chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
- image: selenium/standalone-chrome:latest
+ geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
+ image: selenium/standalone-firefox:latest
shm_size: 2gb
environment:
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
- START_XVFB=${HEADFULL-false}
- SE_VNC_NO_PASSWORD=1
+ # - SE_OPTS="--profile /user_data/news_fetch.profile.firefox"
volumes:
- - ${CONTAINER_DATA}/dependencies:/user_data
+ - ${CONTAINER_DATA}/dependencies:/firefox_profile/
- ${CODE:-/dev/null}:/code
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
expose: ["4444"] # exposed to other docker-compose services only
@@ -60,10 +61,9 @@ services:
news_fetch: # Orchestration of the automatic download. It generates pdfs (via the geckodriver container), fetches descriptions, triggers a snaphsot (on archive.org) and writes to a db
build: news_fetch
image: news_fetch:latest
-
depends_on: # when using docker compose run news_fetch, the dependencies are started as well
- nas_sync
- - chrome
+ - geckodriver
- db_passthrough
volumes:
diff --git a/env/debug b/env/debug
index 9811c57..bde24c1 100644
--- a/env/debug
+++ b/env/debug
@@ -2,6 +2,7 @@
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy
+export U_ID=1000
export DEBUG=true
export HEADFULL=true
diff --git a/geckodriver/edit_profile.sh b/geckodriver/edit_profile.sh
new file mode 100644
index 0000000..8477fd2
--- /dev/null
+++ b/geckodriver/edit_profile.sh
@@ -0,0 +1,8 @@
+if [ -d "/firefox_profile/news_fetch.profile" ]
+then
+ echo "Profile already exists, skipping folder creation"
+else
+ echo "Creating empty folder for profile"
+ mkdir -p /firefox_profile/news_fetch.profile/
+fi
+firefox --profile /firefox_profile/news_fetch.profile
\ No newline at end of file
diff --git a/launch b/launch
index 728ad95..1a1797d 100644
--- a/launch
+++ b/launch
@@ -10,43 +10,61 @@ export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy
export U_ID=1000
+
+### Main use cases ###
if [[ $1 == "debug" ]]
then
export DEBUG=true
export HEADFULL=true
export CODE=./
export ENTRYPOINT=/bin/bash
- # since service ports does not open ports on implicitly started containers, also start chrome:
- docker compose up -d chrome
+ # since service ports does not open ports on implicitly started containers, also start geckodriver:
+ docker compose up -d geckodriver
+
elif [[ $1 == "production" ]]
then
export DEBUG=false
+
elif [[ $1 == "build" ]]
then
export DEBUG=false
- docker compose build
+ shift
+ docker compose build "$@"
exit 0
+
+
+### Manual Shutdown ###
elif [[ $1 == "down" ]]
then
- docker compose stop
+ docker compose down -t 0
exit 0
-elif [[ $1 == "init" ]]
+
+
+
+### Edge cases -> for firefox ###
+elif [[ $1 == "edit_profile" ]]
then
export CODE=./
export HEADFULL=true
- docker compose up -d chrome
+ docker compose up -d geckodriver
sleep 5
- docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh
+ docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh # inside the container
+ docker compose down -t 0
+
+
+### Fallback ####
else
echo "Please specify the execution mode (debug/production/build) as the first argument"
exit 1
fi
+
+
shift # consumes the variable set in $1 so that $@ only contains the remaining arguments
docker compose run -it --service-ports "$@"
echo "Docker run finished, shutting down containers..."
-docker compose stop
+docker compose down -t 0
echo "Bye!"
diff --git a/misc/sample_config/news_fetch.config.ini b/misc/sample_config/news_fetch.config.ini
index e8de2e9..e16f36d 100644
--- a/misc/sample_config/news_fetch.config.ini
+++ b/misc/sample_config/news_fetch.config.ini
@@ -26,4 +26,6 @@ local_storage_path: /app/containerdata/files
debug_storage_path: /app/containerdata/debug/
default_download_path: /app/containerdata/tmp
remote_storage_path: /helbing_support/Files RM/Archiving
-browser_profile_path: /user_data/news_fetch.profile
+browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
+# please keep this exact key name — it is looked up verbatim by the downloader code
+browser_print_delay: 5
diff --git a/misc/youtube_batch.py b/misc/youtube_batch.py
index c2304f5..155540a 100644
--- a/misc/youtube_batch.py
+++ b/misc/youtube_batch.py
@@ -4,24 +4,27 @@ import time
urls = [
- "https://www.youtube.com/watch?v=R4h_yiDIuQE",
- "https://www.youtube.com/watch?v=-G8ZI1Jq8xA",
- "https://www.youtube.com/watch?v=8eYBcASQIQI",
- "https://www.thingiverse.com/thing:5463267",
- "https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s",
- "https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s",
- "https://www.youtube.com/watch?v=bQQn_vET4ys",
- "https://www.youtube.com/watch?v=6FqNctiO06E",
- "https://www.youtube.com/watch?v=ImnuJgj8XJo",
- "https://www.youtube.com/watch?v=4QZQtSqaC34",
- "https://www.youtube.com/watch?v=cW4qIjPMGkQ",
- "https://www.youtube.com/watch?v=QWsUGpKfP8A",
- "https://www.youtube.com/watch?v=a0PwEwLG9No",
- "https://www.youtube.com/watch?v=Hd3lnWVIIpo",
- "https://www.youtube.com/watch?v=JNtdAp-BdzI",
- "https://en.wikipedia.org/wiki/Viktor_Schauberger",
- "https://de.wikipedia.org/wiki/Viktor_Schauberger",
+"https://id2020.org",
+"https://www.weforum.org/platforms/the-centre-for-cybersecurity",
+"https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf",
+"https://en.wikipedia.org/wiki/Social_Credit_System",
+"https://en.wikipedia.org/wiki/Customer_lifetime_value",
+"https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance",
+"https://www.un.org/en/about-us/universal-declaration-of-human-rights",
+"https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines",
+"https://www.wired.com/2008/06/pb-theory/",
+"https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/",
+"https://www.bbc.com/news/world-middle-east-52579475",
+"https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/",
+"https://www.delftdesignforvalues.nl",
+"https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/",
+"https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17",
+"https://www.youtube.com/watch?v=_KhAsJRk2lo",
+"https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/",
+"https://climatecitycup.org",
+
]
+
def post_download_hook(ret_code):
# print(ret_code)
if ret_code['status'] == 'finished':
@@ -45,10 +48,12 @@ def save_video(url):
print(f"Youtube download crashed: {e}")
-# for url in urls:
-# save_video(url)
+# for i, url in enumerate(urls):
+# print(f"Downloading video {i+1} / {len(urls)}")
+ # save_video(url)
-for url in urls:
+for i, url in enumerate(urls):
+ print(f"Saving url {i+1} / {len(urls)}")
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
wayback = WaybackMachineSaveAPI(url, user_agent)
archive_url = wayback.save()
diff --git a/news_check/client/src/ArticleStatus.svelte b/news_check/client/src/ArticleStatus.svelte
index 97bbd3a..0b6050b 100644
--- a/news_check/client/src/ArticleStatus.svelte
+++ b/news_check/client/src/ArticleStatus.svelte
@@ -34,12 +34,14 @@
{#each status_items as item}
{ item.name } |
- {#if (item.value != "" || status_items.valze == false) }
- {#if item.name == "Url"}
- { item.value } |
- {:else}
- { item.value } |
- {/if}
+ {#if (item.value != "" || status_items.value == false) }
+
+ {#if item.name == "Url"}
+ { item.value }
+ {:else}
+ { item.value }
+ {/if}
+ |
{:else}
not set |
{/if}
diff --git a/news_check/server/app.py b/news_check/server/app.py
index 2a6d324..05afe71 100644
--- a/news_check/server/app.py
+++ b/news_check/server/app.py
@@ -53,11 +53,14 @@ def get_article_next(id):
@app.route("/api/article//set", methods=['POST'])
def set_article(id):
- try:
- action = request.json.get('action', None)
- except Exception as e:
- print(f"Exception in set_article {e}")
+ json = request.get_json(silent=True) # do not raise 400 if there is no json!
+ # no json usually means a file was uploaded
+ if json is None:
+ print("Detected likely file upload.")
action = None
+ else:
+ action = request.json.get('action', None) # action inside the json might still be empty
+
with db:
article = models.ArticleDownload.get_by_id(id)
if action:
@@ -66,7 +69,7 @@ def set_article(id):
elif action == "b":
article.verified = -1
else: # implicitly action == "r":
- print(request.files)
+ # request.files is an immutable dict
file = request.files.get("file", None)
if file is None: # upload tends to crash
return "No file uploaded", 400
@@ -74,7 +77,7 @@ def set_article(id):
artname, _ = os.path.splitext(article.file_name)
fname = f"{artname} -- related_{article.related.count() + 1}.{file.filename.split('.')[-1]}"
fpath = os.path.join(article.save_path, fname)
- print(fpath)
+ print(f"Saving file to {fpath}")
file.save(fpath)
article.set_related([fname])
return {"file_path": fpath}
diff --git a/news_fetch/configuration.py b/news_fetch/configuration.py
index 49ba39e..75a13b0 100644
--- a/news_fetch/configuration.py
+++ b/news_fetch/configuration.py
@@ -64,5 +64,5 @@ else:
from utils_storage import models
-# Set up the database
+# Set up the database connection (also creates tables if they don't exist)
models.set_db(download_db)
diff --git a/news_fetch/utils_check/runner.py b/news_fetch/utils_check/runner.py
deleted file mode 100644
index 7d305bf..0000000
--- a/news_fetch/utils_check/runner.py
+++ /dev/null
@@ -1,208 +0,0 @@
-from rich.console import Console
-from rich.table import Table
-from rich.columns import Columns
-from rich.rule import Rule
-console = Console()
-hline = Rule(style="white")
-
-import os
-import subprocess
-from slack_sdk import WebClient
-import configuration
-models = configuration.models
-
-u_options = {
- "ENTER" : "Accept PDF as is. It gets marked as verified",
- "D" : "set languange to DE and set verified",
- "E" : "set languange to EN and set verified",
- "O" : "set other language (prompted)",
- "R" : "set related files (prompted multiple times)",
- "B" : "reject and move to folder BAD",
- "L" : "leave file as is, do not send reaction"
-}
-
-
-bot_client = WebClient(
- token = configuration.main_config["SLACK"]["auth_token"]
-)
-
-
-
-
-
-def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
- """Prints a neat overview of the current article"""
- file_table = Table(
- title = file_url,
- row_styles = ["white", "bright_black"],
- min_width = 100
- )
-
- file_table.add_column("Attribute", justify = "right", no_wrap = True)
- file_table.add_column("Value set by auto_news")
- file_table.add_column("Status", justify = "right")
- for attr in file_attributes:
- file_table.add_row(attr["name"], attr["value"], attr["status"])
-
-
- option_key = "\n".join([f"[[bold]{k}[/bold]]" for k in options.keys()])
- option_action = "\n".join([f"[italic]{k}[/italic]" for k in options.values()])
- columns = Columns([option_key, option_action])
-
- console.print(file_table)
- console.print("Your options:")
- console.print(columns)
-
-
-def send_reaction_to_slack_thread(article, reaction):
- """Sends the verification status as a reaction to the associated slack thread."""
- thread = article.slack_thread
- messages = models.Message.select().where(models.Message.text.contains(article.article_url))
- # TODO rewrite this shit
- if len(messages) > 5:
- print("Found more than 5 messages. Aborting reactions...")
- return
- for m in messages:
- if m.is_processed_override:
- print("Message already processed. Aborting reactions...")
- elif not m.has_single_url:
- print("Found thread but won't send reaction because thread has multiple urls")
- else:
- ts = m.slack_ts
- bot_client.reactions_add(
- channel=configuration.main_config["SLACK"]["archive_id"],
- name=reaction,
- timestamp=ts
- )
- print("Sent reaction to message")
-
-
-def prompt_language(query):
- not_set = True
- while not_set:
- uin = input("Set language (nation-code, 2 letters) ")
- if len(uin) != 2:
- print("Bad code, try again")
- else:
- not_set = False
- query.language = uin
- query.save()
-
-
-def prompt_related(query):
- file_list = []
- finished = False
- while not finished:
- uin = input("Additional file for article? Type '1' to cancel ")
- if uin == "1":
- query.set_related(file_list)
- finished = True
- else:
- file_list.append(uin)
-
-
-def prompt_new_fname(query):
- uin = input("New fname? ")
- old_fname = query.file_name
- query.file_name = uin
- query.verified = 1
- if old_fname != "":
- os.remove(query.save_path + old_fname)
- query.save()
-
-
-
-def reject_article(article):
- article.verified = -1
- article.save()
- print("Article marked as bad")
- # also update the threads to not be monitored anymore
- send_reaction_to_slack_thread(article, "x")
-
-
-def unreject_article(query):
- query.verified = 1
- query.save()
- # os.rename(badpdf, fname)
- print("File set to verified")
-
-
-def accept_article(article, last_accepted):
- article.verified = 1
- article.save()
- print("Article accepted as GOOD")
-
- # also update the threads to not be monitored anymore
- send_reaction_to_slack_thread(article, "white_check_mark")
-
- return "" # linked
-
-
-
-
-
-
-def verify_unchecked():
- query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute()
- last_linked = None
-
- for article in query:
- console.print(hline)
- core_info = []
- for e, name in zip([article.save_path, article.file_name, article.title, article.language], ["Save path", "File name", "Title", "Language"]):
- entry = {
- "status" : "[red]██[/red]" if (len(e) == 0 or e == -1) else "[green]██[/green]",
- "value" : e if len(e) != 0 else "not set",
- "name" : name
- }
- core_info.append(entry)
-
- try:
- # close any previously opened windows:
- # subprocess.call(["kill", "`pgrep evince`"])
- os.system("pkill evince")
- # then open a new one
- subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- # supress evince gtk warnings
- except Exception as e:
- print(e)
- continue
-
-
-
- file_overview(
- file_url = article.article_url,
- file_attributes=core_info,
- options = u_options
- )
-
-
- proceed = False
- while not proceed:
- proceed = False
- uin = input("Choice ?").lower()
- if uin == "":
- last_linked = accept_article(article, last_linked) # last linked accelerates the whole process
- proceed = True
- elif uin == "d":
- article.language = "de"
- article.verified = 1
- article.save()
- proceed = True
- elif uin == "e":
- article.language = "en"
- article.verified = 1
- article.save()
- proceed = True
- elif uin == "o":
- prompt_language(article)
- elif uin == "r":
- prompt_related(article)
- elif uin == "b":
- reject_article(article)
- proceed = True
- elif uin == "l":
- # do nothing
- proceed = True
- else:
- print("Invalid input")
diff --git a/news_fetch/utils_worker/download/browser.py b/news_fetch/utils_worker/download/browser.py
index 38f95d3..284085f 100644
--- a/news_fetch/utils_worker/download/browser.py
+++ b/news_fetch/utils_worker/download/browser.py
@@ -1,70 +1,72 @@
+import logging
import time
import datetime
-import logging
-import os
+
+import os, shutil, uuid
+from pathlib import Path
+
import base64
import requests
from selenium import webdriver
+
import configuration
config = configuration.main_config["DOWNLOADS"]
+def driver_running(f):
+ def wrapper(*args, **kwargs):
+ self = args[0]
+ if not self._running:
+ self.start()
+ return f(*args, **kwargs)
+ return wrapper
+
+
class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter"""
logger = logging.getLogger(__name__)
- # status-variable for restarting:
- running = False
-
+ _running = False
+
+
def start(self):
- self.finish() # clear up
-
- options = webdriver.ChromeOptions()
- options.add_argument(f"user-data-dir={config['browser_profile_path']}")
- options.add_argument('--headless')
+ """Called externally to start the driver, but after an exception can also be called internally"""
+ if self._running:
+ self.finish() # clear up
- # if os.getenv("DEBUG", "false") == "true":
- # self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
- # else:
+ self.logger.info("Starting geckodriver")
+
+ reduced_path = self.create_tmp_profile()
+ profile = webdriver.FirefoxProfile(reduced_path)
+ options = webdriver.FirefoxOptions()
- # options.set_preference('print.save_as_pdf.links.enabled', True)
- # # Just save if the filetype is pdf already
- # # TODO: this is not working right now
+ if os.getenv("DEBUG", "false") == "true":
+ self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
+ else:
+ options.add_argument('--headless')
- # options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
- # options.set_preference("browser.download.folderList", 2)
- # # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
- # # options.set_preference("pdfjs.disabled", True)
- # options.set_preference("browser.download.dir", config["default_download_path"])
-
- self.logger.info("Starting chrome driver")
self.driver = webdriver.Remote(
- command_executor = 'http://chrome:4444', # the host chrome points to the chrome container
+ command_executor = 'http://geckodriver:4444', # the host geckodriver points to the geckodriver container
options = options,
- # can't set log path...
+ browser_profile = profile
)
- self.running = True
+ self._running = True
- def autostart(self):
- if not self.running:
- self.start() # relaunch the dl util
def finish(self):
- if self.running:
- self.logger.info("Exiting chrome driver")
- try:
- self.driver.quit()
- time.sleep(10)
- except:
- self.logger.critical("Connection to the driver broke off")
- self.running = False
- else:
- self.logger.info("Chrome driver not yet running")
+ self.logger.info("Exiting Geckodriver")
+ try:
+ self.driver.quit()
+ time.sleep(10)
+ except:
+ self.logger.critical("Connection to the driver broke off")
+ self._running = False
+
+ @driver_running
def download(self, article_object):
- sleep_time = 2
- self.autostart()
+ sleep_time = int(config["browser_print_delay"])
url = article_object.article_url
try:
@@ -89,20 +91,17 @@ class PDFDownloader:
dst = os.path.join(article_object.save_path, fname)
- if url[-4:] == ".pdf":
- # according to the browser preferences, calling the url will open pdfjs.
- # If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
+ if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly
success = self.get_exisiting_pdf(url, dst)
else:
success = self.get_new_pdf(dst)
-
if success:
article_object.file_name = fname
else:
article_object.file_name = ""
- return article_object # this change is saved later by the external caller
+ return article_object # this change is saved later by the external caller
def get_exisiting_pdf(self, url, dst):
@@ -134,9 +133,26 @@ class PDFDownloader:
except Exception as e:
self.logger.error(f"Failed, because of FS-operation: {e}")
return False
-
+ def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
+ reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
+ print(reduced_profile_path, full_profile_path)
+ os.mkdir(reduced_profile_path)
+ # copy needed directories
+ dirs = ["extensions", "storage"]
+ for dir in dirs:
+ shutil.copytree(full_profile_path / dir, reduced_profile_path / dir)
+
+ # copy needed files
+ files = ["extension-preferences.json", "addons.json", "addonStartup.json.lz4", "prefs.js", "extensions.json", "cookies.sqlite"]
+ for f in files:
+ shutil.copy(full_profile_path / f, reduced_profile_path)
+
+ folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3)
+ self.logger.info(f"Generated temporary profile with size {folder_size} MB")
+ return reduced_profile_path
+