Fixed browser profile bug, line breaks and exceptions in news_check

This commit is contained in:
Remy Moll 2022-09-26 15:25:55 +02:00
parent db161e50c8
commit 9349b046d2
12 changed files with 150 additions and 319 deletions

View File

@ -1,16 +0,0 @@
# Bootstrap a persistent chrome profile on first run (seeded from the default
# profile and augmented with the bypass-paywalls extension archive), then
# launch chrome against that profile.
PROFILE="/user_data/news_fetch.profile"

if [ ! -d "$PROFILE" ]; then
    # First run: let chrome create its default profile, then snapshot it.
    google-chrome &
    sleep 5
    cp -r /home/seluser/.config/google-chrome/Default "$PROFILE"
    PID=$(pidof chrome)
    echo "Now killing processes with pid:" $PID
    kill $PID
    # Drop the paywall-bypass extension archive into the fresh profile.
    cd "$PROFILE"
    wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip
    unzip master
else
    echo "Profile already exists, skipping creation"
fi

google-chrome --user-data-dir="$PROFILE"

View File

@ -27,21 +27,22 @@ services:
- ${CONTAINER_DATA}/files:/sync/local_files - ${CONTAINER_DATA}/files:/sync/local_files
- ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
- ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
command: command:
- nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path - nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path
- lsyncd - lsyncd
- /sync/nas_sync.config - /sync/nas_sync.config
chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
image: selenium/standalone-chrome:latest image: selenium/standalone-firefox:latest
shm_size: 2gb shm_size: 2gb
environment: environment:
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash) - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
- START_XVFB=${HEADFULL-false} - START_XVFB=${HEADFULL-false}
- SE_VNC_NO_PASSWORD=1 - SE_VNC_NO_PASSWORD=1
# - SE_OPTS="--profile /user_data/news_fetch.profile.firefox"
volumes: volumes:
- ${CONTAINER_DATA}/dependencies:/user_data - ${CONTAINER_DATA}/dependencies:/firefox_profile/
- ${CODE:-/dev/null}:/code - ${CODE:-/dev/null}:/code
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
expose: ["4444"] # exposed to other docker-compose services only expose: ["4444"] # exposed to other docker-compose services only
@ -60,10 +61,9 @@ services:
news_fetch: # Orchestration of the automatic download. It generates pdfs (via the geckodriver container), fetches descriptions, triggers a snaphsot (on archive.org) and writes to a db news_fetch: # Orchestration of the automatic download. It generates pdfs (via the geckodriver container), fetches descriptions, triggers a snaphsot (on archive.org) and writes to a db
build: news_fetch build: news_fetch
image: news_fetch:latest image: news_fetch:latest
depends_on: # when using docker compose run news_fetch, the dependencies are started as well depends_on: # when using docker compose run news_fetch, the dependencies are started as well
- nas_sync - nas_sync
- chrome - geckodriver
- db_passthrough - db_passthrough
volumes: volumes:

1
env/debug vendored
View File

@ -2,6 +2,7 @@
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy export UNAME=remy
export U_ID=1000
export DEBUG=true export DEBUG=true
export HEADFULL=true export HEADFULL=true

View File

@ -0,0 +1,8 @@
# Ensure the firefox profile directory exists, then launch firefox with it so
# the profile can be edited interactively inside the container.
PROFILE_DIR="/firefox_profile/news_fetch.profile"

if [ ! -d "$PROFILE_DIR" ]; then
    echo "Creating empty folder for profile"
    mkdir -p "$PROFILE_DIR"/
else
    echo "Profile already exists, skipping folder creation"
fi

firefox --profile "$PROFILE_DIR"

34
launch
View File

@ -10,43 +10,61 @@ export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy export UNAME=remy
export U_ID=1000 export U_ID=1000
### Main use cases ###
if [[ $1 == "debug" ]] if [[ $1 == "debug" ]]
then then
export DEBUG=true export DEBUG=true
export HEADFULL=true export HEADFULL=true
export CODE=./ export CODE=./
export ENTRYPOINT=/bin/bash export ENTRYPOINT=/bin/bash
# since service ports does not open ports on implicitly started containers, also start chrome: # since service ports does not open ports on implicitly started containers, also start geckodriver:
docker compose up -d chrome docker compose up -d geckodriver
elif [[ $1 == "production" ]] elif [[ $1 == "production" ]]
then then
export DEBUG=false export DEBUG=false
elif [[ $1 == "build" ]] elif [[ $1 == "build" ]]
then then
export DEBUG=false export DEBUG=false
docker compose build shift
docker compose build "$@"
exit 0 exit 0
### Manual Shutdown ###
elif [[ $1 == "down" ]] elif [[ $1 == "down" ]]
then then
docker compose stop docker compose down -t 0
exit 0 exit 0
elif [[ $1 == "init" ]]
### Edge cases -> for firefox ###
elif [[ $1 == "edit_profile" ]]
then then
export CODE=./ export CODE=./
export HEADFULL=true export HEADFULL=true
docker compose up -d chrome docker compose up -d geckodriver
sleep 5 sleep 5
docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh # inside the container
docker compose down -t 0
### Fallback ####
else else
echo "Please specify the execution mode (debug/production/build) as the first argument" echo "Please specify the execution mode (debug/production/build) as the first argument"
exit 1 exit 1
fi fi
shift # consumes the variable set in $1 so that $@ only contains the remaining arguments shift # consumes the variable set in $1 so that $@ only contains the remaining arguments
docker compose run -it --service-ports "$@" docker compose run -it --service-ports "$@"
echo "Docker run finished, shutting down containers..." echo "Docker run finished, shutting down containers..."
docker compose stop docker compose down -t 0
echo "Bye!" echo "Bye!"

View File

@ -26,4 +26,6 @@ local_storage_path: /app/containerdata/files
debug_storage_path: /app/containerdata/debug/ debug_storage_path: /app/containerdata/debug/
default_download_path: /app/containerdata/tmp default_download_path: /app/containerdata/tmp
remote_storage_path: /helbing_support/Files RM/Archiving remote_storage_path: /helbing_support/Files RM/Archiving
browser_profile_path: /user_data/news_fetch.profile browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
# please keep this exact name
browser_print_delay: 5

View File

@ -4,24 +4,27 @@ import time
urls = [ urls = [
"https://www.youtube.com/watch?v=R4h_yiDIuQE", "https://id2020.org",
"https://www.youtube.com/watch?v=-G8ZI1Jq8xA", "https://www.weforum.org/platforms/the-centre-for-cybersecurity",
"https://www.youtube.com/watch?v=8eYBcASQIQI", "https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf",
"https://www.thingiverse.com/thing:5463267", "https://en.wikipedia.org/wiki/Social_Credit_System",
"https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s", "https://en.wikipedia.org/wiki/Customer_lifetime_value",
"https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s", "https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance",
"https://www.youtube.com/watch?v=bQQn_vET4ys", "https://www.un.org/en/about-us/universal-declaration-of-human-rights",
"https://www.youtube.com/watch?v=6FqNctiO06E", "https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines",
"https://www.youtube.com/watch?v=ImnuJgj8XJo", "https://www.wired.com/2008/06/pb-theory/",
"https://www.youtube.com/watch?v=4QZQtSqaC34", "https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/",
"https://www.youtube.com/watch?v=cW4qIjPMGkQ", "https://www.bbc.com/news/world-middle-east-52579475",
"https://www.youtube.com/watch?v=QWsUGpKfP8A", "https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/",
"https://www.youtube.com/watch?v=a0PwEwLG9No", "https://www.delftdesignforvalues.nl",
"https://www.youtube.com/watch?v=Hd3lnWVIIpo", "https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/",
"https://www.youtube.com/watch?v=JNtdAp-BdzI", "https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17",
"https://en.wikipedia.org/wiki/Viktor_Schauberger", "https://www.youtube.com/watch?v=_KhAsJRk2lo",
"https://de.wikipedia.org/wiki/Viktor_Schauberger", "https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/",
"https://climatecitycup.org",
] ]
def post_download_hook(ret_code): def post_download_hook(ret_code):
# print(ret_code) # print(ret_code)
if ret_code['status'] == 'finished': if ret_code['status'] == 'finished':
@ -45,10 +48,12 @@ def save_video(url):
print(f"Youtube download crashed: {e}") print(f"Youtube download crashed: {e}")
# for url in urls: # for i, url in enumerate(urls):
# save_video(url) # print(f"Downloading video {i+1} / {len(urls)}")
# save_video(url)
for url in urls: for i, url in enumerate(urls):
print(f"Saving url {i+1} / {len(urls)}")
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed? user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
wayback = WaybackMachineSaveAPI(url, user_agent) wayback = WaybackMachineSaveAPI(url, user_agent)
archive_url = wayback.save() archive_url = wayback.save()

View File

@ -34,12 +34,14 @@
{#each status_items as item} {#each status_items as item}
<tr> <tr>
<td>{ item.name }</td> <td>{ item.name }</td>
{#if (item.value != "" || status_items.valze == false) } {#if (item.value != "" || status_items.value == false) }
{#if item.name == "Url"} <td class='bg-emerald-200' style="white-space: normal; width:70%">
<td class='bg-emerald-200'><a href="{ item.value }" target="_blank">{ item.value }</a></td> {#if item.name == "Url"}
{:else} <a href="{ item.value }" target="_blank">{ item.value }</a>
<td class='bg-emerald-200' style="white-space: normal; width:70%">{ item.value }</td> {:else}
{/if} { item.value }
{/if}
</td>
{:else} {:else}
<td class='bg-red-200'>not set</td> <td class='bg-red-200'>not set</td>
{/if} {/if}

View File

@ -53,11 +53,14 @@ def get_article_next(id):
@app.route("/api/article/<int:id>/set", methods=['POST']) @app.route("/api/article/<int:id>/set", methods=['POST'])
def set_article(id): def set_article(id):
try: json = request.get_json(silent=True) # do not raise 400 if there is no json!
action = request.json.get('action', None) # no json usually means a file was uploaded
except Exception as e: if json is None:
print(f"Exception in set_article {e}") print("Detected likely file upload.")
action = None action = None
else:
action = request.json.get('action', None) # action inside the json might still be empty
with db: with db:
article = models.ArticleDownload.get_by_id(id) article = models.ArticleDownload.get_by_id(id)
if action: if action:
@ -66,7 +69,7 @@ def set_article(id):
elif action == "b": elif action == "b":
article.verified = -1 article.verified = -1
else: # implicitly action == "r": else: # implicitly action == "r":
print(request.files) # request.files is an immutable dict
file = request.files.get("file", None) file = request.files.get("file", None)
if file is None: # upload tends to crash if file is None: # upload tends to crash
return "No file uploaded", 400 return "No file uploaded", 400
@ -74,7 +77,7 @@ def set_article(id):
artname, _ = os.path.splitext(article.file_name) artname, _ = os.path.splitext(article.file_name)
fname = f"{artname} -- related_{article.related.count() + 1}.{file.filename.split('.')[-1]}" fname = f"{artname} -- related_{article.related.count() + 1}.{file.filename.split('.')[-1]}"
fpath = os.path.join(article.save_path, fname) fpath = os.path.join(article.save_path, fname)
print(fpath) print(f"Saving file to {fpath}")
file.save(fpath) file.save(fpath)
article.set_related([fname]) article.set_related([fname])
return {"file_path": fpath} return {"file_path": fpath}

View File

@ -64,5 +64,5 @@ else:
from utils_storage import models from utils_storage import models
# Set up the database # Set up the database connection (also creates tables if they don't exist)
models.set_db(download_db) models.set_db(download_db)

View File

@ -1,208 +0,0 @@
# Interactive verification tool for downloaded articles: renders an overview of
# each unchecked article with `rich` and mirrors the verdict to slack.
from rich.console import Console
from rich.table import Table
from rich.columns import Columns
from rich.rule import Rule

console = Console()
hline = Rule(style="white")

import os
import subprocess

from slack_sdk import WebClient

import configuration

models = configuration.models

# Keyboard options offered to the reviewer for each article.
# Fix: the D/E descriptions previously read "languange" (typo shown to the user).
u_options = {
    "ENTER" : "Accept PDF as is. It gets marked as verified",
    "D" : "set language to DE and set verified",
    "E" : "set language to EN and set verified",
    "O" : "set other language (prompted)",
    "R" : "set related files (prompted multiple times)",
    "B" : "reject and move to folder BAD",
    "L" : "leave file as is, do not send reaction"
}

# Slack client used to attach reaction emojis to the article's thread.
bot_client = WebClient(
    token = configuration.main_config["SLACK"]["auth_token"]
)
def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
    """Render a table of the article's attributes plus the available key commands."""
    overview = Table(
        title = file_url,
        row_styles = ["white", "bright_black"],
        min_width = 100
    )
    overview.add_column("Attribute", justify = "right", no_wrap = True)
    overview.add_column("Value set by auto_news")
    overview.add_column("Status", justify = "right")
    for entry in file_attributes:
        overview.add_row(entry["name"], entry["value"], entry["status"])

    # Two side-by-side columns: the hotkeys on the left, their meaning on the right.
    keys = "\n".join(f"[[bold]{key}[/bold]]" for key in options.keys())
    actions = "\n".join(f"[italic]{action}[/italic]" for action in options.values())

    console.print(overview)
    console.print("Your options:")
    console.print(Columns([keys, actions]))
def send_reaction_to_slack_thread(article, reaction):
    """Sends the verification status as a reaction to the associated slack thread."""
    thread = article.slack_thread
    # Every slack message that mentioned this article's url is a candidate.
    candidates = models.Message.select().where(
        models.Message.text.contains(article.article_url)
    )
    # TODO rewrite this shit
    if len(candidates) > 5:
        print("Found more than 5 messages. Aborting reactions...")
        return

    for message in candidates:
        if message.is_processed_override:
            print("Message already processed. Aborting reactions...")
            continue
        if not message.has_single_url:
            print("Found thread but won't send reaction because thread has multiple urls")
            continue
        bot_client.reactions_add(
            channel = configuration.main_config["SLACK"]["archive_id"],
            name = reaction,
            timestamp = message.slack_ts
        )
        print("Sent reaction to message")
def prompt_language(query):
    """Keep prompting until a 2-letter language code is entered, then persist it."""
    while True:
        code = input("Set language (nation-code, 2 letters) ")
        if len(code) == 2:
            query.language = code
            query.save()
            break
        print("Bad code, try again")
def prompt_related(query):
    """Collect related file names one per prompt; entering '1' stops and saves the list."""
    related = []
    while True:
        answer = input("Additional file for article? Type '1' to cancel ")
        if answer == "1":
            break
        related.append(answer)
    query.set_related(related)
def prompt_new_fname(query):
    """Ask for a new file name, mark the article verified and remove the old file.

    Fix: the old file was removed via `query.save_path + old_fname`, which
    targets the wrong path whenever save_path lacks a trailing separator;
    os.path.join handles both cases correctly.
    """
    new_name = input("New fname? ")
    old_fname = query.file_name
    query.file_name = new_name
    query.verified = 1
    if old_fname != "":
        os.remove(os.path.join(query.save_path, old_fname))
    query.save()
def reject_article(article):
    """Mark the article as bad (verified = -1) and flag its slack thread with an 'x'."""
    article.verified = -1
    article.save()
    print("Article marked as bad")
    # mirror the rejection to slack so the thread is no longer monitored
    send_reaction_to_slack_thread(article, "x")
def unreject_article(query):
    """Flip a previously rejected article back to verified and persist it."""
    query.verified = 1
    query.save()
    # NOTE(review): the pdf itself is not moved back out of the BAD folder here
    # (a commented-out os.rename hinted at that); confirm whether that is intended.
    print("File set to verified")
def accept_article(article, last_accepted):
    """Mark the article verified and send a check-mark reaction to its slack thread.

    Returns "" (the caller stores it as the new "last accepted" marker;
    last_accepted itself is currently unused here).
    """
    article.verified = 1
    article.save()
    print("Article accepted as GOOD")
    # mirror the acceptance to slack so the thread is no longer monitored
    send_reaction_to_slack_thread(article, "white_check_mark")
    return "" # linked
def verify_unchecked():
    """Interactively review every article still marked unchecked (verified == 0).

    For each article: prints a status table, opens the pdf in evince and then
    dispatches on a single-key choice (see u_options for the key bindings).
    """
    pending = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute()
    last_linked = None

    for article in pending:
        console.print(hline)

        # Build the attribute rows for the overview table; red block == unset.
        core_info = []
        attributes = [article.save_path, article.file_name, article.title, article.language]
        labels = ["Save path", "File name", "Title", "Language"]
        for value, label in zip(attributes, labels):
            core_info.append({
                "status" : "[red]██[/red]" if (len(value) == 0 or value == -1) else "[green]██[/green]",
                "value" : value if len(value) != 0 else "not set",
                "name" : label,
            })

        try:
            # close any previously opened viewer windows, then open a new one;
            # stdout/stderr are piped away to suppress evince gtk warnings
            os.system("pkill evince")
            subprocess.Popen(
                ["evince", f"file://{os.path.join(article.save_path, article.file_name)}"],
                stdout = subprocess.PIPE,
                stderr = subprocess.PIPE,
            )
        except Exception as exc:
            print(exc)
            continue

        file_overview(
            file_url = article.article_url,
            file_attributes = core_info,
            options = u_options,
        )

        # Re-prompt until the user picks an action that resolves this article.
        proceed = False
        while not proceed:
            choice = input("Choice ?").lower()
            if choice == "":
                # last_linked accelerates the whole process
                last_linked = accept_article(article, last_linked)
                proceed = True
            elif choice == "d":
                article.language = "de"
                article.verified = 1
                article.save()
                proceed = True
            elif choice == "e":
                article.language = "en"
                article.verified = 1
                article.save()
                proceed = True
            elif choice == "o":
                prompt_language(article)
            elif choice == "r":
                prompt_related(article)
            elif choice == "b":
                reject_article(article)
                proceed = True
            elif choice == "l":
                # leave the file as is, do nothing
                proceed = True
            else:
                print("Invalid input")

View File

@ -1,70 +1,72 @@
import logging
import time import time
import datetime import datetime
import logging
import os import os, shutil, uuid
from pathlib import Path
import base64 import base64
import requests import requests
from selenium import webdriver from selenium import webdriver
import configuration import configuration
config = configuration.main_config["DOWNLOADS"] config = configuration.main_config["DOWNLOADS"]
def driver_running(f):
def wrapper(*args, **kwargs):
self = args[0]
if not self._running:
self.start()
return f(*args, **kwargs)
return wrapper
class PDFDownloader: class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter""" """Saves a given url. Fills the object it got as a parameter"""
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# status-variable for restarting: _running = False
running = False
def start(self): def start(self):
self.finish() # clear up """Called externally to start the driver, but after an exception can also be called internally"""
if self._running:
options = webdriver.ChromeOptions() self.finish() # clear up
options.add_argument(f"user-data-dir={config['browser_profile_path']}")
options.add_argument('--headless')
# if os.getenv("DEBUG", "false") == "true": self.logger.info("Starting geckodriver")
# self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
# else: reduced_path = self.create_tmp_profile()
profile = webdriver.FirefoxProfile(reduced_path)
options = webdriver.FirefoxOptions()
# options.set_preference('print.save_as_pdf.links.enabled', True) if os.getenv("DEBUG", "false") == "true":
# # Just save if the filetype is pdf already self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
# # TODO: this is not working right now else:
options.add_argument('--headless')
# options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
# options.set_preference("browser.download.folderList", 2)
# # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# # options.set_preference("pdfjs.disabled", True)
# options.set_preference("browser.download.dir", config["default_download_path"])
self.logger.info("Starting chrome driver")
self.driver = webdriver.Remote( self.driver = webdriver.Remote(
command_executor = 'http://chrome:4444', # the host chrome points to the chrome container command_executor = 'http://geckodriver:4444', # the host geckodriver points to the geckodriver container
options = options, options = options,
# can't set log path... browser_profile = profile
) )
self.running = True self._running = True
def autostart(self):
if not self.running:
self.start() # relaunch the dl util
def finish(self): def finish(self):
if self.running: self.logger.info("Exiting Geckodriver")
self.logger.info("Exiting chrome driver") try:
try: self.driver.quit()
self.driver.quit() time.sleep(10)
time.sleep(10) except:
except: self.logger.critical("Connection to the driver broke off")
self.logger.critical("Connection to the driver broke off") self._running = False
self.running = False
else:
self.logger.info("Chrome driver not yet running")
@driver_running
def download(self, article_object): def download(self, article_object):
sleep_time = 2 sleep_time = int(config["browser_print_delay"])
self.autostart()
url = article_object.article_url url = article_object.article_url
try: try:
@ -89,20 +91,17 @@ class PDFDownloader:
dst = os.path.join(article_object.save_path, fname) dst = os.path.join(article_object.save_path, fname)
if url[-4:] == ".pdf": if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
# according to the browser preferences, calling the url will open pdfjs.
# If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
success = self.get_exisiting_pdf(url, dst) success = self.get_exisiting_pdf(url, dst)
else: else:
success = self.get_new_pdf(dst) success = self.get_new_pdf(dst)
if success: if success:
article_object.file_name = fname article_object.file_name = fname
else: else:
article_object.file_name = "" article_object.file_name = ""
return article_object # this change is saved later by the external caller return article_object # this change is saved later by the external caller
def get_exisiting_pdf(self, url, dst): def get_exisiting_pdf(self, url, dst):
@ -134,9 +133,26 @@ class PDFDownloader:
except Exception as e: except Exception as e:
self.logger.error(f"Failed, because of FS-operation: {e}") self.logger.error(f"Failed, because of FS-operation: {e}")
return False return False
def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
print(reduced_profile_path, full_profile_path)
os.mkdir(reduced_profile_path)
# copy needed directories
dirs = ["extensions", "storage"]
for dir in dirs:
shutil.copytree(full_profile_path / dir, reduced_profile_path / dir)
# copy needed files
files = ["extension-preferences.json", "addons.json", "addonStartup.json.lz4", "prefs.js", "extensions.json", "cookies.sqlite"]
for f in files:
shutil.copy(full_profile_path / f, reduced_profile_path)
folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3)
self.logger.info(f"Generated temporary profile with size {folder_size} MB")
return reduced_profile_path