diff --git a/chrome/change_configuration.sh b/chrome/change_configuration.sh deleted file mode 100644 index c4fa27b..0000000 --- a/chrome/change_configuration.sh +++ /dev/null @@ -1,16 +0,0 @@ -if [ -d "/user_data/news_fetch.profile" ] -then - echo "Profile already exists, skipping creation" -else - google-chrome & - sleep 5 - cp -r /home/seluser/.config/google-chrome/Default /user_data/news_fetch.profile - PID=$(pidof chrome) - echo "Now killing processes with pid:" $PID - kill $PID - cd /user_data/news_fetch.profile - wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip - unzip master -fi - -google-chrome --user-data-dir=/user_data/news_fetch.profile \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 69dbae5..8027b99 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -27,21 +27,22 @@ services: - ${CONTAINER_DATA}/files:/sync/local_files - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config - command: + command: - nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path - lsyncd - /sync/nas_sync.config - chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) - image: selenium/standalone-chrome:latest + geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) + image: selenium/standalone-firefox:latest shm_size: 2gb environment: - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash) - START_XVFB=${HEADFULL-false} - SE_VNC_NO_PASSWORD=1 + # - SE_OPTS="--profile /user_data/news_fetch.profile.firefox" volumes: - - ${CONTAINER_DATA}/dependencies:/user_data + - ${CONTAINER_DATA}/dependencies:/firefox_profile/ - ${CODE:-/dev/null}:/code user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user expose: ["4444"] # exposed to other docker-compose services only @@ -60,10 +61,9 @@ services: news_fetch: # Orchestration of the automatic download. It generates pdfs (via the geckodriver container), fetches descriptions, triggers a snaphsot (on archive.org) and writes to a db build: news_fetch image: news_fetch:latest - depends_on: # when using docker compose run news_fetch, the dependencies are started as well - nas_sync - - chrome + - geckodriver - db_passthrough volumes: diff --git a/env/debug b/env/debug index 9811c57..bde24c1 100644 --- a/env/debug +++ b/env/debug @@ -2,6 +2,7 @@ export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export UNAME=remy +export U_ID=1000 export DEBUG=true export HEADFULL=true diff --git a/geckodriver/edit_profile.sh b/geckodriver/edit_profile.sh new file mode 100644 index 0000000..8477fd2 --- /dev/null +++ b/geckodriver/edit_profile.sh @@ -0,0 +1,8 @@ +if [ -d "/firefox_profile/news_fetch.profile" ] +then + echo "Profile already exists, skipping folder creation" +else + echo "Creating empty folder for profile" + mkdir -p /firefox_profile/news_fetch.profile/ +fi +firefox --profile /firefox_profile/news_fetch.profile \ No newline at end of file diff --git a/launch b/launch index 728ad95..1a1797d 100644 --- a/launch +++ b/launch @@ -10,43 +10,61 @@ export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export UNAME=remy export U_ID=1000 + +### Main use cases ### if [[ $1 == "debug" ]] then export DEBUG=true export HEADFULL=true export CODE=./ export ENTRYPOINT=/bin/bash - # since service ports does not open ports on implicitly started containers, also start chrome: - docker compose up -d chrome + # since service ports does not open ports on implicitly started containers, also start geckodriver: + docker compose up -d geckodriver + elif [[ $1 == "production" ]] then export DEBUG=false + elif [[ $1 == "build" ]] then export DEBUG=false - docker compose build + shift + docker compose build "$@" exit 0 + + +### Manual Shutdown ### elif [[ $1 == "down" ]] then - docker compose stop + docker compose down -t 0 exit 0 -elif [[ $1 == "init" ]] + + + +### Edge cases -> for firefox ### +elif [[ $1 == "edit_profile" ]] then export CODE=./ export HEADFULL=true - docker compose up -d chrome + docker compose up -d geckodriver sleep 5 - docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh + docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh # inside the container + docker compose down -t 0 + + +### Fallback #### else echo "Please specify the execution mode (debug/production/build) as the first argument" exit 1 fi + + shift # consumes the variable set in $1 so that $@ only contains the remaining arguments docker compose run -it --service-ports "$@" echo "Docker run finished, shutting down containers..." -docker compose stop +docker compose down -t 0 echo "Bye!" diff --git a/misc/sample_config/news_fetch.config.ini b/misc/sample_config/news_fetch.config.ini index e8de2e9..e16f36d 100644 --- a/misc/sample_config/news_fetch.config.ini +++ b/misc/sample_config/news_fetch.config.ini @@ -26,4 +26,6 @@ local_storage_path: /app/containerdata/files debug_storage_path: /app/containerdata/debug/ default_download_path: /app/containerdata/tmp remote_storage_path: /helbing_support/Files RM/Archiving -browser_profile_path: /user_data/news_fetch.profile +browser_profile_path: /app/containerdata/dependencies/news_fetch.profile +# please keep this exact name +browser_print_delay: 5 diff --git a/misc/youtube_batch.py b/misc/youtube_batch.py index c2304f5..155540a 100644 --- a/misc/youtube_batch.py +++ b/misc/youtube_batch.py @@ -4,24 +4,27 @@ import time urls = [ - "https://www.youtube.com/watch?v=R4h_yiDIuQE", - "https://www.youtube.com/watch?v=-G8ZI1Jq8xA", - "https://www.youtube.com/watch?v=8eYBcASQIQI", - "https://www.thingiverse.com/thing:5463267", - "https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s", - "https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s", - "https://www.youtube.com/watch?v=bQQn_vET4ys", - "https://www.youtube.com/watch?v=6FqNctiO06E", - "https://www.youtube.com/watch?v=ImnuJgj8XJo", - "https://www.youtube.com/watch?v=4QZQtSqaC34", - "https://www.youtube.com/watch?v=cW4qIjPMGkQ", - "https://www.youtube.com/watch?v=QWsUGpKfP8A", - "https://www.youtube.com/watch?v=a0PwEwLG9No", - "https://www.youtube.com/watch?v=Hd3lnWVIIpo", - "https://www.youtube.com/watch?v=JNtdAp-BdzI", - "https://en.wikipedia.org/wiki/Viktor_Schauberger", - "https://de.wikipedia.org/wiki/Viktor_Schauberger", +"https://id2020.org", +"https://www.weforum.org/platforms/the-centre-for-cybersecurity", +"https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf", +"https://en.wikipedia.org/wiki/Social_Credit_System", +"https://en.wikipedia.org/wiki/Customer_lifetime_value", +"https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance", +"https://www.un.org/en/about-us/universal-declaration-of-human-rights", +"https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines", +"https://www.wired.com/2008/06/pb-theory/", +"https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/", +"https://www.bbc.com/news/world-middle-east-52579475", +"https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/", +"https://www.delftdesignforvalues.nl", +"https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/", +"https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17", +"https://www.youtube.com/watch?v=_KhAsJRk2lo", +"https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/", +"https://climatecitycup.org", + ] + def post_download_hook(ret_code): # print(ret_code) if ret_code['status'] == 'finished': @@ -45,10 +48,12 @@ def save_video(url): print(f"Youtube download crashed: {e}") -# for url in urls: -# save_video(url) +# for i, url in enumerate(urls): +# print(f"Downloading video {i+1} / {len(urls)}") + # save_video(url) -for url in urls: +for i, url in enumerate(urls): + print(f"Saving url {i+1} / {len(urls)}") user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed? wayback = WaybackMachineSaveAPI(url, user_agent) archive_url = wayback.save() diff --git a/news_check/client/src/ArticleStatus.svelte b/news_check/client/src/ArticleStatus.svelte index 97bbd3a..0b6050b 100644 --- a/news_check/client/src/ArticleStatus.svelte +++ b/news_check/client/src/ArticleStatus.svelte @@ -34,12 +34,14 @@ {#each status_items as item} { item.name } - {#if (item.value != "" || status_items.valze == false) } - {#if item.name == "Url"} - { item.value } - {:else} - { item.value } - {/if} + {#if (item.value != "" || status_items.value == false) } + + {#if item.name == "Url"} + { item.value } + {:else} + { item.value } + {/if} + {:else} not set {/if} diff --git a/news_check/server/app.py b/news_check/server/app.py index 2a6d324..05afe71 100644 --- a/news_check/server/app.py +++ b/news_check/server/app.py @@ -53,11 +53,14 @@ def get_article_next(id): @app.route("/api/article//set", methods=['POST']) def set_article(id): - try: - action = request.json.get('action', None) - except Exception as e: - print(f"Exception in set_article {e}") + json = request.get_json(silent=True) # do not raise 400 if there is no json! + # no json usually means a file was uploaded + if json is None: + print("Detected likely file upload.") action = None + else: + action = request.json.get('action', None) # action inside the json might still be empty + with db: article = models.ArticleDownload.get_by_id(id) if action: @@ -66,7 +69,7 @@ def set_article(id): elif action == "b": article.verified = -1 else: # implicitly action == "r": - print(request.files) + # request.files is an immutable dict file = request.files.get("file", None) if file is None: # upload tends to crash return "No file uploaded", 400 @@ -74,7 +77,7 @@ def set_article(id): artname, _ = os.path.splitext(article.file_name) fname = f"{artname} -- related_{article.related.count() + 1}.{file.filename.split('.')[-1]}" fpath = os.path.join(article.save_path, fname) - print(fpath) + print(f"Saving file to {fpath}") file.save(fpath) article.set_related([fname]) return {"file_path": fpath} diff --git a/news_fetch/configuration.py b/news_fetch/configuration.py index 49ba39e..75a13b0 100644 --- a/news_fetch/configuration.py +++ b/news_fetch/configuration.py @@ -64,5 +64,5 @@ else: from utils_storage import models -# Set up the database +# Set up the database connection (also creates tables if they don't exist) models.set_db(download_db) diff --git a/news_fetch/utils_check/runner.py b/news_fetch/utils_check/runner.py deleted file mode 100644 index 7d305bf..0000000 --- a/news_fetch/utils_check/runner.py +++ /dev/null @@ -1,208 +0,0 @@ -from rich.console import Console -from rich.table import Table -from rich.columns import Columns -from rich.rule import Rule -console = Console() -hline = Rule(style="white") - -import os -import subprocess -from slack_sdk import WebClient -import configuration -models = configuration.models - -u_options = { - "ENTER" : "Accept PDF as is. It gets marked as verified", - "D" : "set languange to DE and set verified", - "E" : "set languange to EN and set verified", - "O" : "set other language (prompted)", - "R" : "set related files (prompted multiple times)", - "B" : "reject and move to folder BAD", - "L" : "leave file as is, do not send reaction" -} - - -bot_client = WebClient( - token = configuration.main_config["SLACK"]["auth_token"] -) - - - - - -def file_overview(file_url: str, file_attributes: list, options: dict) -> None: - """Prints a neat overview of the current article""" - file_table = Table( - title = file_url, - row_styles = ["white", "bright_black"], - min_width = 100 - ) - - file_table.add_column("Attribute", justify = "right", no_wrap = True) - file_table.add_column("Value set by auto_news") - file_table.add_column("Status", justify = "right") - for attr in file_attributes: - file_table.add_row(attr["name"], attr["value"], attr["status"]) - - - option_key = "\n".join([f"[[bold]{k}[/bold]]" for k in options.keys()]) - option_action = "\n".join([f"[italic]{k}[/italic]" for k in options.values()]) - columns = Columns([option_key, option_action]) - - console.print(file_table) - console.print("Your options:") - console.print(columns) - - -def send_reaction_to_slack_thread(article, reaction): - """Sends the verification status as a reaction to the associated slack thread.""" - thread = article.slack_thread - messages = models.Message.select().where(models.Message.text.contains(article.article_url)) - # TODO rewrite this shit - if len(messages) > 5: - print("Found more than 5 messages. Aborting reactions...") - return - for m in messages: - if m.is_processed_override: - print("Message already processed. Aborting reactions...") - elif not m.has_single_url: - print("Found thread but won't send reaction because thread has multiple urls") - else: - ts = m.slack_ts - bot_client.reactions_add( - channel=configuration.main_config["SLACK"]["archive_id"], - name=reaction, - timestamp=ts - ) - print("Sent reaction to message") - - -def prompt_language(query): - not_set = True - while not_set: - uin = input("Set language (nation-code, 2 letters) ") - if len(uin) != 2: - print("Bad code, try again") - else: - not_set = False - query.language = uin - query.save() - - -def prompt_related(query): - file_list = [] - finished = False - while not finished: - uin = input("Additional file for article? Type '1' to cancel ") - if uin == "1": - query.set_related(file_list) - finished = True - else: - file_list.append(uin) - - -def prompt_new_fname(query): - uin = input("New fname? ") - old_fname = query.file_name - query.file_name = uin - query.verified = 1 - if old_fname != "": - os.remove(query.save_path + old_fname) - query.save() - - - -def reject_article(article): - article.verified = -1 - article.save() - print("Article marked as bad") - # also update the threads to not be monitored anymore - send_reaction_to_slack_thread(article, "x") - - -def unreject_article(query): - query.verified = 1 - query.save() - # os.rename(badpdf, fname) - print("File set to verified") - - -def accept_article(article, last_accepted): - article.verified = 1 - article.save() - print("Article accepted as GOOD") - - # also update the threads to not be monitored anymore - send_reaction_to_slack_thread(article, "white_check_mark") - - return "" # linked - - - - - - -def verify_unchecked(): - query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute() - last_linked = None - - for article in query: - console.print(hline) - core_info = [] - for e, name in zip([article.save_path, article.file_name, article.title, article.language], ["Save path", "File name", "Title", "Language"]): - entry = { - "status" : "[red]██[/red]" if (len(e) == 0 or e == -1) else "[green]██[/green]", - "value" : e if len(e) != 0 else "not set", - "name" : name - } - core_info.append(entry) - - try: - # close any previously opened windows: - # subprocess.call(["kill", "`pgrep evince`"]) - os.system("pkill evince") - # then open a new one - subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - # supress evince gtk warnings - except Exception as e: - print(e) - continue - - - - file_overview( - file_url = article.article_url, - file_attributes=core_info, - options = u_options - ) - - - proceed = False - while not proceed: - proceed = False - uin = input("Choice ?").lower() - if uin == "": - last_linked = accept_article(article, last_linked) # last linked accelerates the whole process - proceed = True - elif uin == "d": - article.language = "de" - article.verified = 1 - article.save() - proceed = True - elif uin == "e": - article.language = "en" - article.verified = 1 - article.save() - proceed = True - elif uin == "o": - prompt_language(article) - elif uin == "r": - prompt_related(article) - elif uin == "b": - reject_article(article) - proceed = True - elif uin == "l": - # do nothing - proceed = True - else: - print("Invalid input") diff --git a/news_fetch/utils_worker/download/browser.py b/news_fetch/utils_worker/download/browser.py index 38f95d3..284085f 100644 --- a/news_fetch/utils_worker/download/browser.py +++ b/news_fetch/utils_worker/download/browser.py @@ -1,70 +1,72 @@ +import logging import time import datetime -import logging -import os + +import os, shutil, uuid +from pathlib import Path + import base64 import requests from selenium import webdriver + import configuration config = configuration.main_config["DOWNLOADS"] +def driver_running(f): + def wrapper(*args, **kwargs): + self = args[0] + if not self._running: + self.start() + return f(*args, **kwargs) + return wrapper + + class PDFDownloader: """Saves a given url. Fills the object it got as a parameter""" logger = logging.getLogger(__name__) - # status-variable for restarting: - running = False - + _running = False + + def start(self): - self.finish() # clear up - - options = webdriver.ChromeOptions() - options.add_argument(f"user-data-dir={config['browser_profile_path']}") - options.add_argument('--headless') + """Called externally to start the driver, but after an exception can also be called internally""" + if self._running: + self.finish() # clear up - # if os.getenv("DEBUG", "false") == "true": - # self.logger.warning("Opening browser GUI because of 'DEBUG=true'") - # else: + self.logger.info("Starting geckodriver") + + reduced_path = self.create_tmp_profile() + profile = webdriver.FirefoxProfile(reduced_path) + options = webdriver.FirefoxOptions() - # options.set_preference('print.save_as_pdf.links.enabled', True) - # # Just save if the filetype is pdf already - # # TODO: this is not working right now + if os.getenv("DEBUG", "false") == "true": + self.logger.warning("Opening browser GUI because of 'DEBUG=true'") + else: + options.add_argument('--headless') - # options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True) - # options.set_preference("browser.download.folderList", 2) - # # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") - # # options.set_preference("pdfjs.disabled", True) - # options.set_preference("browser.download.dir", config["default_download_path"]) - - self.logger.info("Starting chrome driver") self.driver = webdriver.Remote( - command_executor = 'http://chrome:4444', # the host chrome points to the chrome container + command_executor = 'http://geckodriver:4444', # the host geckodriver points to the geckodriver container options = options, - # can't set log path... + browser_profile = profile ) - self.running = True + self._running = True - def autostart(self): - if not self.running: - self.start() # relaunch the dl util def finish(self): - if self.running: - self.logger.info("Exiting chrome driver") - try: - self.driver.quit() - time.sleep(10) - except: - self.logger.critical("Connection to the driver broke off") - self.running = False - else: - self.logger.info("Chrome driver not yet running") + self.logger.info("Exiting Geckodriver") + try: + self.driver.quit() + time.sleep(10) + except: + self.logger.critical("Connection to the driver broke off") + self._running = False + + @driver_running def download(self, article_object): - sleep_time = 2 - self.autostart() + sleep_time = int(config["browser_print_delay"]) url = article_object.article_url try: @@ -89,20 +91,17 @@ class PDFDownloader: dst = os.path.join(article_object.save_path, fname) - if url[-4:] == ".pdf": - # according to the browser preferences, calling the url will open pdfjs. - # If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least + if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly success = self.get_exisiting_pdf(url, dst) else: success = self.get_new_pdf(dst) - if success: article_object.file_name = fname else: article_object.file_name = "" - return article_object # this change is saved later by the external caller + return article_object # this change is saved later by the external caller def get_exisiting_pdf(self, url, dst): @@ -134,9 +133,26 @@ class PDFDownloader: except Exception as e: self.logger.error(f"Failed, because of FS-operation: {e}") return False - + def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path: + reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}") + print(reduced_profile_path, full_profile_path) + os.mkdir(reduced_profile_path) + # copy needed directories + dirs = ["extensions", "storage"] + for dir in dirs: + shutil.copytree(full_profile_path / dir, reduced_profile_path / dir) + + # copy needed files + files = ["extension-preferences.json", "addons.json", "addonStartup.json.lz4", "prefs.js", "extensions.json", "cookies.sqlite"] + for f in files: + shutil.copy(full_profile_path / f, reduced_profile_path) + + folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3) + self.logger.info(f"Generated temporary profile with size {folder_size} MB") + return reduced_profile_path +