Fixed browser profile bug, line breaks and exceptions in news_check
This commit is contained in:
		| @@ -1,16 +0,0 @@ | |||||||
| if [ -d "/user_data/news_fetch.profile" ]  |  | ||||||
| then |  | ||||||
|     echo "Profile already exists, skipping creation" |  | ||||||
| else |  | ||||||
|     google-chrome & |  | ||||||
|     sleep 5 |  | ||||||
|     cp -r /home/seluser/.config/google-chrome/Default /user_data/news_fetch.profile |  | ||||||
|     PID=$(pidof chrome) |  | ||||||
|     echo "Now killing processes with pid:" $PID |  | ||||||
|     kill $PID |  | ||||||
|     cd /user_data/news_fetch.profile |  | ||||||
|     wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip |  | ||||||
|     unzip master |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| google-chrome --user-data-dir=/user_data/news_fetch.profile |  | ||||||
| @@ -33,15 +33,16 @@ services: | |||||||
|       - /sync/nas_sync.config |       - /sync/nas_sync.config | ||||||
|  |  | ||||||
|  |  | ||||||
|   chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) |   geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) | ||||||
|     image: selenium/standalone-chrome:latest |     image: selenium/standalone-firefox:latest | ||||||
|     shm_size: 2gb |     shm_size: 2gb | ||||||
|     environment: |     environment: | ||||||
|       - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash) |       - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash) | ||||||
|       - START_XVFB=${HEADFULL-false} |       - START_XVFB=${HEADFULL-false} | ||||||
|       - SE_VNC_NO_PASSWORD=1 |       - SE_VNC_NO_PASSWORD=1 | ||||||
|  |       # - SE_OPTS="--profile /user_data/news_fetch.profile.firefox" | ||||||
|     volumes: |     volumes: | ||||||
|       - ${CONTAINER_DATA}/dependencies:/user_data |       - ${CONTAINER_DATA}/dependencies:/firefox_profile/ | ||||||
|       - ${CODE:-/dev/null}:/code |       - ${CODE:-/dev/null}:/code | ||||||
|     user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user |     user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user | ||||||
|     expose: ["4444"] # exposed to other docker-compose services only |     expose: ["4444"] # exposed to other docker-compose services only | ||||||
| @@ -60,10 +61,9 @@ services: | |||||||
|   news_fetch: # Orchestration of the automatic download. It generates pdfs (via the geckodriver container), fetches descriptions, triggers a snapshot (on archive.org) and writes to a db |   news_fetch: # Orchestration of the automatic download. It generates pdfs (via the geckodriver container), fetches descriptions, triggers a snapshot (on archive.org) and writes to a db | ||||||
|     build: news_fetch |     build: news_fetch | ||||||
|     image: news_fetch:latest |     image: news_fetch:latest | ||||||
|  |  | ||||||
|     depends_on: # when using docker compose run news_fetch, the dependencies are started as well |     depends_on: # when using docker compose run news_fetch, the dependencies are started as well | ||||||
|       - nas_sync |       - nas_sync | ||||||
|       - chrome |       - geckodriver | ||||||
|       - db_passthrough |       - db_passthrough | ||||||
|  |  | ||||||
|     volumes: |     volumes: | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								env/debug
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								env/debug
									
									
									
									
										vendored
									
									
								
							| @@ -2,6 +2,7 @@ | |||||||
|  |  | ||||||
| export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving | export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving | ||||||
| export UNAME=remy | export UNAME=remy | ||||||
|  | export U_ID=1000 | ||||||
|  |  | ||||||
| export DEBUG=true | export DEBUG=true | ||||||
| export HEADFULL=true | export HEADFULL=true | ||||||
|   | |||||||
							
								
								
									
										8
									
								
								geckodriver/edit_profile.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								geckodriver/edit_profile.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,8 @@ | |||||||
|  | if [ -d "/firefox_profile/news_fetch.profile" ]  | ||||||
|  | then | ||||||
|  |     echo "Profile already exists, skipping folder creation" | ||||||
|  | else | ||||||
|  |     echo "Creating empty folder for profile" | ||||||
|  |     mkdir -p /firefox_profile/news_fetch.profile/   | ||||||
|  | fi | ||||||
|  | firefox --profile /firefox_profile/news_fetch.profile | ||||||
							
								
								
									
										34
									
								
								launch
									
									
									
									
									
								
							
							
						
						
									
										34
									
								
								launch
									
									
									
									
									
								
							| @@ -10,43 +10,61 @@ export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving | |||||||
| export UNAME=remy | export UNAME=remy | ||||||
| export U_ID=1000 | export U_ID=1000 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ### Main use cases ### | ||||||
| if [[ $1 == "debug" ]] | if [[ $1 == "debug" ]] | ||||||
| then | then | ||||||
|     export DEBUG=true |     export DEBUG=true | ||||||
|     export HEADFULL=true |     export HEADFULL=true | ||||||
|     export CODE=./ |     export CODE=./ | ||||||
|     export ENTRYPOINT=/bin/bash |     export ENTRYPOINT=/bin/bash | ||||||
|     # since service ports does not open ports on implicitly started containers, also start chrome: |     # since service ports does not open ports on implicitly started containers, also start geckodriver: | ||||||
|     docker compose up -d chrome |     docker compose up -d geckodriver | ||||||
|  |  | ||||||
| elif [[ $1 == "production" ]] | elif [[ $1 == "production" ]] | ||||||
| then | then | ||||||
|     export DEBUG=false |     export DEBUG=false | ||||||
|  |  | ||||||
| elif [[ $1 == "build" ]] | elif [[ $1 == "build" ]] | ||||||
| then | then | ||||||
|     export DEBUG=false |     export DEBUG=false | ||||||
|     docker compose build |     shift | ||||||
|  |     docker compose build "$@" | ||||||
|     exit 0 |     exit 0 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ### Manual Shutdown ### | ||||||
| elif [[ $1 == "down" ]] | elif [[ $1 == "down" ]] | ||||||
| then | then | ||||||
|     docker compose stop |     docker compose down -t 0 | ||||||
|     exit 0 |     exit 0 | ||||||
| elif [[ $1 == "init" ]] |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ### Edge cases -> for firefox ### | ||||||
|  | elif [[ $1 == "edit_profile" ]] | ||||||
| then | then | ||||||
|     export CODE=./ |     export CODE=./ | ||||||
|     export HEADFULL=true |     export HEADFULL=true | ||||||
|  |  | ||||||
|     docker compose up -d chrome |     docker compose up -d geckodriver | ||||||
|     sleep 5 |     sleep 5 | ||||||
|     docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh |     docker compose exec  geckodriver /bin/bash /code/geckodriver/edit_profile.sh # inside the container | ||||||
|  |     docker compose down -t 0 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ### Fallback #### | ||||||
| else | else | ||||||
|     echo "Please specify the execution mode (debug/production/build) as the first argument" |     echo "Please specify the execution mode (debug/production/build) as the first argument" | ||||||
|     exit 1 |     exit 1 | ||||||
| fi | fi | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| shift # consumes the variable set in $1 so that $@ only contains the remaining arguments | shift # consumes the variable set in $1 so that $@ only contains the remaining arguments | ||||||
|  |  | ||||||
| docker compose run -it --service-ports "$@" | docker compose run -it --service-ports "$@" | ||||||
|  |  | ||||||
| echo "Docker run finished, shutting down containers..." | echo "Docker run finished, shutting down containers..." | ||||||
| docker compose stop | docker compose down -t 0 | ||||||
| echo "Bye!" | echo "Bye!" | ||||||
|   | |||||||
| @@ -26,4 +26,6 @@ local_storage_path: /app/containerdata/files | |||||||
| debug_storage_path: /app/containerdata/debug/ | debug_storage_path: /app/containerdata/debug/ | ||||||
| default_download_path: /app/containerdata/tmp | default_download_path: /app/containerdata/tmp | ||||||
| remote_storage_path: /helbing_support/Files RM/Archiving | remote_storage_path: /helbing_support/Files RM/Archiving | ||||||
| browser_profile_path: /user_data/news_fetch.profile | browser_profile_path: /app/containerdata/dependencies/news_fetch.profile | ||||||
|  | # please keep this exact name | ||||||
|  | browser_print_delay: 5 | ||||||
|   | |||||||
| @@ -4,24 +4,27 @@ import time | |||||||
|  |  | ||||||
|  |  | ||||||
| urls = [ | urls = [ | ||||||
|     "https://www.youtube.com/watch?v=R4h_yiDIuQE", | "https://id2020.org", | ||||||
|     "https://www.youtube.com/watch?v=-G8ZI1Jq8xA", | "https://www.weforum.org/platforms/the-centre-for-cybersecurity", | ||||||
|     "https://www.youtube.com/watch?v=8eYBcASQIQI", | "https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf", | ||||||
|     "https://www.thingiverse.com/thing:5463267", | "https://en.wikipedia.org/wiki/Social_Credit_System", | ||||||
|     "https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s", | "https://en.wikipedia.org/wiki/Customer_lifetime_value", | ||||||
|     "https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s", | "https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance", | ||||||
|     "https://www.youtube.com/watch?v=bQQn_vET4ys", | "https://www.un.org/en/about-us/universal-declaration-of-human-rights", | ||||||
|     "https://www.youtube.com/watch?v=6FqNctiO06E", | "https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines", | ||||||
|     "https://www.youtube.com/watch?v=ImnuJgj8XJo", | "https://www.wired.com/2008/06/pb-theory/", | ||||||
|     "https://www.youtube.com/watch?v=4QZQtSqaC34", | "https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/", | ||||||
|     "https://www.youtube.com/watch?v=cW4qIjPMGkQ", | "https://www.bbc.com/news/world-middle-east-52579475", | ||||||
|     "https://www.youtube.com/watch?v=QWsUGpKfP8A", | "https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/", | ||||||
|     "https://www.youtube.com/watch?v=a0PwEwLG9No", | "https://www.delftdesignforvalues.nl", | ||||||
|     "https://www.youtube.com/watch?v=Hd3lnWVIIpo", | "https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/", | ||||||
|     "https://www.youtube.com/watch?v=JNtdAp-BdzI", | "https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17", | ||||||
|     "https://en.wikipedia.org/wiki/Viktor_Schauberger", | "https://www.youtube.com/watch?v=_KhAsJRk2lo", | ||||||
|     "https://de.wikipedia.org/wiki/Viktor_Schauberger", | "https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/", | ||||||
|  | "https://climatecitycup.org", | ||||||
|  |  | ||||||
| ] | ] | ||||||
|  |  | ||||||
| def post_download_hook(ret_code): | def post_download_hook(ret_code): | ||||||
|     # print(ret_code) |     # print(ret_code) | ||||||
|     if ret_code['status'] == 'finished': |     if ret_code['status'] == 'finished': | ||||||
| @@ -45,10 +48,12 @@ def save_video(url): | |||||||
|         print(f"Youtube download crashed: {e}") |         print(f"Youtube download crashed: {e}") | ||||||
|  |  | ||||||
|  |  | ||||||
| # for url in urls: | # for i, url in enumerate(urls): | ||||||
| #     save_video(url) | #     print(f"Downloading video {i+1} / {len(urls)}") | ||||||
|  |     # save_video(url) | ||||||
|  |  | ||||||
| for url in urls: | for i, url in enumerate(urls): | ||||||
|  |     print(f"Saving url {i+1} / {len(urls)}") | ||||||
|     user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed? |     user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed? | ||||||
|     wayback = WaybackMachineSaveAPI(url, user_agent) |     wayback = WaybackMachineSaveAPI(url, user_agent) | ||||||
|     archive_url = wayback.save() |     archive_url = wayback.save() | ||||||
|   | |||||||
| @@ -34,12 +34,14 @@ | |||||||
|             {#each status_items as item} |             {#each status_items as item} | ||||||
|                 <tr> |                 <tr> | ||||||
|                     <td>{ item.name }</td> |                     <td>{ item.name }</td> | ||||||
|                     {#if (item.value != "" || status_items.valze == false) } |                     {#if (item.value != "" || status_items.value == false) } | ||||||
|  |                       <td class='bg-emerald-200' style="white-space: normal; width:70%"> | ||||||
|                         {#if item.name == "Url"} |                         {#if item.name == "Url"} | ||||||
|                         <td class='bg-emerald-200'><a href="{ item.value }" target="_blank">{ item.value }</a></td> |                           <a href="{ item.value }" target="_blank">{ item.value }</a> | ||||||
|                         {:else} |                         {:else} | ||||||
|                         <td class='bg-emerald-200' style="white-space: normal; width:70%">{ item.value }</td> |                           { item.value } | ||||||
|                         {/if} |                         {/if} | ||||||
|  |                       </td> | ||||||
|                     {:else} |                     {:else} | ||||||
|                       <td class='bg-red-200'>not set</td> |                       <td class='bg-red-200'>not set</td> | ||||||
|                     {/if} |                     {/if} | ||||||
|   | |||||||
| @@ -53,11 +53,14 @@ def get_article_next(id): | |||||||
|  |  | ||||||
| @app.route("/api/article/<int:id>/set", methods=['POST']) | @app.route("/api/article/<int:id>/set", methods=['POST']) | ||||||
| def set_article(id): | def set_article(id): | ||||||
|     try: |     json = request.get_json(silent=True) # do not raise 400 if there is no json! | ||||||
|         action = request.json.get('action', None) |     # no json usually means a file was uploaded | ||||||
|     except Exception as e: |     if json is None: | ||||||
|         print(f"Exception in set_article {e}") |         print("Detected likely file upload.") | ||||||
|         action = None |         action = None | ||||||
|  |     else: | ||||||
|  |         action = request.json.get('action', None) # action inside the json might still be empty | ||||||
|  |  | ||||||
|     with db: |     with db: | ||||||
|         article = models.ArticleDownload.get_by_id(id) |         article = models.ArticleDownload.get_by_id(id) | ||||||
|         if action: |         if action: | ||||||
| @@ -66,7 +69,7 @@ def set_article(id): | |||||||
|             elif action == "b": |             elif action == "b": | ||||||
|                 article.verified = -1 |                 article.verified = -1 | ||||||
|         else: # implicitly action == "r": |         else: # implicitly action == "r": | ||||||
|             print(request.files) |             # request.files is an immutable dict | ||||||
|             file = request.files.get("file", None) |             file = request.files.get("file", None) | ||||||
|             if file is None: # upload tends to crash |             if file is None: # upload tends to crash | ||||||
|                 return "No file uploaded", 400 |                 return "No file uploaded", 400 | ||||||
| @@ -74,7 +77,7 @@ def set_article(id): | |||||||
|             artname, _ = os.path.splitext(article.file_name) |             artname, _ = os.path.splitext(article.file_name) | ||||||
|             fname =  f"{artname} -- related_{article.related.count() + 1}.{file.filename.split('.')[-1]}" |             fname =  f"{artname} -- related_{article.related.count() + 1}.{file.filename.split('.')[-1]}" | ||||||
|             fpath = os.path.join(article.save_path, fname) |             fpath = os.path.join(article.save_path, fname) | ||||||
|             print(fpath) |             print(f"Saving file to {fpath}") | ||||||
|             file.save(fpath) |             file.save(fpath) | ||||||
|             article.set_related([fname]) |             article.set_related([fname]) | ||||||
|             return {"file_path": fpath} |             return {"file_path": fpath} | ||||||
|   | |||||||
| @@ -64,5 +64,5 @@ else: | |||||||
|  |  | ||||||
| from utils_storage import models | from utils_storage import models | ||||||
|  |  | ||||||
| # Set up the database | # Set up the database connection (also creates tables if they don't exist) | ||||||
| models.set_db(download_db) | models.set_db(download_db) | ||||||
|   | |||||||
| @@ -1,208 +0,0 @@ | |||||||
| from rich.console import Console |  | ||||||
| from rich.table import Table |  | ||||||
| from rich.columns import Columns |  | ||||||
| from rich.rule import Rule |  | ||||||
| console = Console() |  | ||||||
| hline = Rule(style="white") |  | ||||||
|  |  | ||||||
| import os |  | ||||||
| import subprocess |  | ||||||
| from slack_sdk import WebClient |  | ||||||
| import configuration |  | ||||||
| models = configuration.models |  | ||||||
|  |  | ||||||
| u_options = { |  | ||||||
|     "ENTER" : "Accept PDF as is. It gets marked as verified", |  | ||||||
|     "D" : "set languange to DE and set verified", |  | ||||||
|     "E" : "set languange to EN and set verified", |  | ||||||
|     "O" : "set other language (prompted)", |  | ||||||
|     "R" : "set related files (prompted multiple times)", |  | ||||||
|     "B" : "reject and move to folder BAD", |  | ||||||
|     "L" : "leave file as is, do not send reaction" |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| bot_client = WebClient( |  | ||||||
|     token = configuration.main_config["SLACK"]["auth_token"] |  | ||||||
| ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def file_overview(file_url: str, file_attributes: list, options: dict) -> None: |  | ||||||
|     """Prints a neat overview of the current article""" |  | ||||||
|     file_table = Table( |  | ||||||
|         title = file_url, |  | ||||||
|         row_styles = ["white", "bright_black"], |  | ||||||
|         min_width = 100 |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|     file_table.add_column("Attribute", justify = "right", no_wrap = True) |  | ||||||
|     file_table.add_column("Value set by auto_news") |  | ||||||
|     file_table.add_column("Status", justify = "right") |  | ||||||
|     for attr in file_attributes: |  | ||||||
|         file_table.add_row(attr["name"], attr["value"], attr["status"]) |  | ||||||
|  |  | ||||||
|      |  | ||||||
|     option_key = "\n".join([f"[[bold]{k}[/bold]]" for k in options.keys()]) |  | ||||||
|     option_action = "\n".join([f"[italic]{k}[/italic]" for k in options.values()]) |  | ||||||
|     columns = Columns([option_key, option_action]) |  | ||||||
|  |  | ||||||
|     console.print(file_table) |  | ||||||
|     console.print("Your options:") |  | ||||||
|     console.print(columns) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def send_reaction_to_slack_thread(article, reaction): |  | ||||||
|     """Sends the verification status as a reaction to the associated slack thread.""" |  | ||||||
|     thread = article.slack_thread |  | ||||||
|     messages = models.Message.select().where(models.Message.text.contains(article.article_url)) |  | ||||||
|     # TODO rewrite this shit |  | ||||||
|     if len(messages) > 5: |  | ||||||
|         print("Found more than 5 messages. Aborting reactions...") |  | ||||||
|         return |  | ||||||
|     for m in messages: |  | ||||||
|         if m.is_processed_override: |  | ||||||
|             print("Message already processed. Aborting reactions...") |  | ||||||
|         elif not m.has_single_url: |  | ||||||
|             print("Found thread but won't send reaction because thread has multiple urls") |  | ||||||
|         else: |  | ||||||
|             ts = m.slack_ts |  | ||||||
|             bot_client.reactions_add( |  | ||||||
|                 channel=configuration.main_config["SLACK"]["archive_id"], |  | ||||||
|                 name=reaction, |  | ||||||
|                 timestamp=ts |  | ||||||
|             ) |  | ||||||
|             print("Sent reaction to message") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def prompt_language(query): |  | ||||||
|     not_set = True |  | ||||||
|     while not_set: |  | ||||||
|         uin = input("Set language (nation-code, 2 letters) ") |  | ||||||
|         if len(uin) != 2: |  | ||||||
|             print("Bad code, try again") |  | ||||||
|         else: |  | ||||||
|             not_set = False |  | ||||||
|             query.language = uin |  | ||||||
|             query.save() |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def prompt_related(query): |  | ||||||
|     file_list = [] |  | ||||||
|     finished = False |  | ||||||
|     while not finished: |  | ||||||
|         uin = input("Additional file for article? Type '1' to cancel ") |  | ||||||
|         if uin == "1": |  | ||||||
|             query.set_related(file_list) |  | ||||||
|             finished = True |  | ||||||
|         else: |  | ||||||
|             file_list.append(uin) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def prompt_new_fname(query): |  | ||||||
|     uin = input("New fname? ") |  | ||||||
|     old_fname =  query.file_name |  | ||||||
|     query.file_name = uin |  | ||||||
|     query.verified = 1 |  | ||||||
|     if old_fname != "": |  | ||||||
|         os.remove(query.save_path + old_fname) |  | ||||||
|     query.save()     |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def reject_article(article): |  | ||||||
|     article.verified = -1 |  | ||||||
|     article.save() |  | ||||||
|     print("Article marked as bad") |  | ||||||
|     # also update the threads to not be monitored anymore |  | ||||||
|     send_reaction_to_slack_thread(article, "x") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def unreject_article(query): |  | ||||||
|     query.verified = 1 |  | ||||||
|     query.save() |  | ||||||
|     # os.rename(badpdf, fname) |  | ||||||
|     print("File set to verified") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def accept_article(article, last_accepted): |  | ||||||
|     article.verified = 1 |  | ||||||
|     article.save() |  | ||||||
|     print("Article accepted as GOOD") |  | ||||||
|  |  | ||||||
|     # also update the threads to not be monitored anymore |  | ||||||
|     send_reaction_to_slack_thread(article, "white_check_mark") |  | ||||||
|  |  | ||||||
|     return "" # linked |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def verify_unchecked(): |  | ||||||
|     query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute() |  | ||||||
|     last_linked = None |  | ||||||
|  |  | ||||||
|     for article in query: |  | ||||||
|         console.print(hline) |  | ||||||
|         core_info = [] |  | ||||||
|         for e, name in zip([article.save_path, article.file_name, article.title, article.language], ["Save path", "File name", "Title", "Language"]): |  | ||||||
|             entry = { |  | ||||||
|                 "status" : "[red]██[/red]" if (len(e) == 0 or e == -1) else "[green]██[/green]", |  | ||||||
|                 "value" : e if len(e) != 0 else "not set", |  | ||||||
|                 "name" : name |  | ||||||
|             } |  | ||||||
|             core_info.append(entry) |  | ||||||
|          |  | ||||||
|         try: |  | ||||||
|             # close any previously opened windows: |  | ||||||
|             # subprocess.call(["kill", "`pgrep evince`"]) |  | ||||||
|             os.system("pkill evince") |  | ||||||
|             # then open a new one |  | ||||||
|             subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) |  | ||||||
|             # suppress evince gtk warnings |  | ||||||
|         except Exception as e: |  | ||||||
|             print(e) |  | ||||||
|             continue |  | ||||||
|  |  | ||||||
|          |  | ||||||
|  |  | ||||||
|         file_overview( |  | ||||||
|             file_url = article.article_url,  |  | ||||||
|             file_attributes=core_info, |  | ||||||
|             options = u_options |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
|         proceed = False |  | ||||||
|         while not proceed: |  | ||||||
|             proceed = False |  | ||||||
|             uin = input("Choice ?").lower() |  | ||||||
|             if uin == "": |  | ||||||
|                 last_linked = accept_article(article, last_linked) # last linked accelerates the whole process |  | ||||||
|                 proceed = True |  | ||||||
|             elif uin == "d": |  | ||||||
|                 article.language = "de" |  | ||||||
|                 article.verified = 1 |  | ||||||
|                 article.save() |  | ||||||
|                 proceed = True |  | ||||||
|             elif uin == "e": |  | ||||||
|                 article.language = "en" |  | ||||||
|                 article.verified = 1 |  | ||||||
|                 article.save() |  | ||||||
|                 proceed = True |  | ||||||
|             elif uin == "o": |  | ||||||
|                 prompt_language(article) |  | ||||||
|             elif uin == "r": |  | ||||||
|                 prompt_related(article) |  | ||||||
|             elif uin == "b": |  | ||||||
|                 reject_article(article) |  | ||||||
|                 proceed = True |  | ||||||
|             elif uin == "l": |  | ||||||
|                 # do nothing |  | ||||||
|                 proceed = True |  | ||||||
|             else: |  | ||||||
|                 print("Invalid input") |  | ||||||
| @@ -1,70 +1,72 @@ | |||||||
|  | import logging | ||||||
| import time | import time | ||||||
| import datetime | import datetime | ||||||
| import logging |  | ||||||
| import os | import os, shutil, uuid | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
| import base64 | import base64 | ||||||
| import requests | import requests | ||||||
| from selenium import webdriver | from selenium import webdriver | ||||||
|  |  | ||||||
| import configuration | import configuration | ||||||
|  |  | ||||||
| config = configuration.main_config["DOWNLOADS"] | config = configuration.main_config["DOWNLOADS"] | ||||||
|  |  | ||||||
|  | def driver_running(f): | ||||||
|  |     def wrapper(*args, **kwargs): | ||||||
|  |         self = args[0] | ||||||
|  |         if not self._running: | ||||||
|  |             self.start() | ||||||
|  |         return f(*args, **kwargs) | ||||||
|  |     return wrapper | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class PDFDownloader: | class PDFDownloader: | ||||||
|     """Saves a given url. Fills the object it got as a parameter""" |     """Saves a given url. Fills the object it got as a parameter""" | ||||||
|     logger = logging.getLogger(__name__) |     logger = logging.getLogger(__name__) | ||||||
|     # status-variable for restarting: |     _running = False | ||||||
|     running = False |  | ||||||
|  |  | ||||||
|     def start(self): |     def start(self): | ||||||
|  |         """Called externally to start the driver, but after an exception can also be called internally""" | ||||||
|  |         if self._running: | ||||||
|             self.finish() # clear up |             self.finish() # clear up | ||||||
|  |  | ||||||
|         options = webdriver.ChromeOptions() |         self.logger.info("Starting geckodriver") | ||||||
|         options.add_argument(f"user-data-dir={config['browser_profile_path']}") |          | ||||||
|  |         reduced_path = self.create_tmp_profile() | ||||||
|  |         profile = webdriver.FirefoxProfile(reduced_path) | ||||||
|  |         options = webdriver.FirefoxOptions() | ||||||
|  |  | ||||||
|  |         if os.getenv("DEBUG", "false") == "true": | ||||||
|  |             self.logger.warning("Opening browser GUI because of 'DEBUG=true'") | ||||||
|  |         else: | ||||||
|             options.add_argument('--headless') |             options.add_argument('--headless') | ||||||
|  |  | ||||||
|         # if os.getenv("DEBUG", "false") == "true": |  | ||||||
|         #     self.logger.warning("Opening browser GUI because of 'DEBUG=true'") |  | ||||||
|         # else: |  | ||||||
|  |  | ||||||
|         # options.set_preference('print.save_as_pdf.links.enabled', True) |  | ||||||
|         # # Just save if the filetype is pdf already |  | ||||||
|         # # TODO: this is not working right now |  | ||||||
|  |  | ||||||
|         # options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True) |  | ||||||
|         # options.set_preference("browser.download.folderList", 2) |  | ||||||
|         # # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") |  | ||||||
|         # # options.set_preference("pdfjs.disabled", True) |  | ||||||
|         # options.set_preference("browser.download.dir", config["default_download_path"]) |  | ||||||
|  |  | ||||||
|         self.logger.info("Starting chrome driver") |  | ||||||
|         self.driver = webdriver.Remote( |         self.driver = webdriver.Remote( | ||||||
|             command_executor = 'http://chrome:4444', # the host chrome points to the chrome container |             command_executor = 'http://geckodriver:4444', # the host geckodriver points to the geckodriver container | ||||||
|             options = options, |             options = options, | ||||||
|             # can't set log path... |             browser_profile = profile | ||||||
|         ) |         ) | ||||||
|          |          | ||||||
|         self.running = True |         self._running = True | ||||||
|  |  | ||||||
|     def autostart(self): |  | ||||||
|         if not self.running: |  | ||||||
|             self.start()  # relaunch the dl util |  | ||||||
|  |  | ||||||
|     def finish(self): |     def finish(self): | ||||||
|         if self.running: |         self.logger.info("Exiting Geckodriver") | ||||||
|             self.logger.info("Exiting chrome driver") |  | ||||||
|         try: |         try: | ||||||
|             self.driver.quit() |             self.driver.quit() | ||||||
|             time.sleep(10) |             time.sleep(10) | ||||||
|         except: |         except: | ||||||
|             self.logger.critical("Connection to the driver broke off") |             self.logger.critical("Connection to the driver broke off") | ||||||
|             self.running = False |         self._running = False | ||||||
|         else: |  | ||||||
|             self.logger.info("Chrome driver not yet running") |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     @driver_running | ||||||
|     def download(self, article_object): |     def download(self, article_object): | ||||||
|         sleep_time = 2 |         sleep_time = int(config["browser_print_delay"]) | ||||||
|         self.autostart() |  | ||||||
|         url = article_object.article_url |         url = article_object.article_url | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
| @@ -89,14 +91,11 @@ class PDFDownloader: | |||||||
|             dst = os.path.join(article_object.save_path, fname) |             dst = os.path.join(article_object.save_path, fname) | ||||||
|  |  | ||||||
|  |  | ||||||
|         if url[-4:] == ".pdf": |         if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly | ||||||
|             # according to the browser preferences, calling the url will open pdfjs. |  | ||||||
|             # If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least |  | ||||||
|             success = self.get_exisiting_pdf(url, dst) |             success = self.get_exisiting_pdf(url, dst) | ||||||
|         else: |         else: | ||||||
|             success = self.get_new_pdf(dst) |             success = self.get_new_pdf(dst) | ||||||
|  |  | ||||||
|  |  | ||||||
|         if success: |         if success: | ||||||
|             article_object.file_name = fname |             article_object.file_name = fname | ||||||
|         else: |         else: | ||||||
| @@ -136,6 +135,23 @@ class PDFDownloader: | |||||||
|             return False |             return False | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path: | ||||||
|  |         reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}") | ||||||
|  |         print(reduced_profile_path, full_profile_path) | ||||||
|  |         os.mkdir(reduced_profile_path) | ||||||
|  |         # copy needed directories | ||||||
|  |         dirs = ["extensions", "storage"] | ||||||
|  |         for dir in dirs: | ||||||
|  |             shutil.copytree(full_profile_path / dir, reduced_profile_path / dir) | ||||||
|  |  | ||||||
|  |         # copy needed files | ||||||
|  |         files = ["extension-preferences.json", "addons.json", "addonStartup.json.lz4", "prefs.js", "extensions.json", "cookies.sqlite"] | ||||||
|  |         for f in files: | ||||||
|  |             shutil.copy(full_profile_path / f, reduced_profile_path) | ||||||
|  |          | ||||||
|  |         folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3) | ||||||
|  |         self.logger.info(f"Generated temporary profile with size {folder_size} MB") | ||||||
|  |         return reduced_profile_path | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user