Fixed browser profile bug, line breaks and exceptions in news_check
This commit is contained in:
parent
db161e50c8
commit
9349b046d2
@ -1,16 +0,0 @@
|
||||
# Creates the Chrome profile folder for news_fetch when it does not exist yet,
# then opens Chrome on that profile.
PROFILE_DIR="/user_data/news_fetch.profile"

if [ -d "$PROFILE_DIR" ]
then
    echo "Profile already exists, skipping creation"
else
    # Start Chrome in the background and give it a few seconds
    # (presumably so that it writes its Default profile before the copy below)
    google-chrome &
    sleep 5
    cp -r /home/seluser/.config/google-chrome/Default "$PROFILE_DIR"
    PID=$(pidof chrome)
    echo "Now killing processes with pid:" $PID
    # $PID is deliberately unquoted: pidof may print several pids
    kill $PID
    # Stop here if the folder is missing, otherwise the archive below would be
    # downloaded and unpacked into whatever the current directory happens to be
    cd "$PROFILE_DIR" || exit 1
    wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip
    unzip master
fi

google-chrome --user-data-dir="$PROFILE_DIR"
|
@ -27,21 +27,22 @@ services:
|
||||
- ${CONTAINER_DATA}/files:/sync/local_files
|
||||
- ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
|
||||
- ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
|
||||
command:
|
||||
command:
|
||||
- nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path
|
||||
- lsyncd
|
||||
- /sync/nas_sync.config
|
||||
|
||||
|
||||
chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
|
||||
image: selenium/standalone-chrome:latest
|
||||
geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
|
||||
image: selenium/standalone-firefox:latest
|
||||
shm_size: 2gb
|
||||
environment:
|
||||
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
|
||||
- START_XVFB=${HEADFULL-false}
|
||||
- SE_VNC_NO_PASSWORD=1
|
||||
# - SE_OPTS="--profile /user_data/news_fetch.profile.firefox"
|
||||
volumes:
|
||||
- ${CONTAINER_DATA}/dependencies:/user_data
|
||||
- ${CONTAINER_DATA}/dependencies:/firefox_profile/
|
||||
- ${CODE:-/dev/null}:/code
|
||||
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
|
||||
expose: ["4444"] # exposed to other docker-compose services only
|
||||
@ -60,10 +61,9 @@ services:
|
||||
news_fetch: # Orchestration of the automatic download. It generates pdfs (via the geckodriver container), fetches descriptions, triggers a snapshot (on archive.org) and writes to a db
|
||||
build: news_fetch
|
||||
image: news_fetch:latest
|
||||
|
||||
depends_on: # when using docker compose run news_fetch, the dependencies are started as well
|
||||
- nas_sync
|
||||
- chrome
|
||||
- geckodriver
|
||||
- db_passthrough
|
||||
|
||||
volumes:
|
||||
|
1
env/debug
vendored
1
env/debug
vendored
@ -2,6 +2,7 @@
|
||||
|
||||
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
|
||||
export UNAME=remy
|
||||
export U_ID=1000
|
||||
|
||||
export DEBUG=true
|
||||
export HEADFULL=true
|
||||
|
8
geckodriver/edit_profile.sh
Normal file
8
geckodriver/edit_profile.sh
Normal file
@ -0,0 +1,8 @@
|
||||
# Makes sure the Firefox profile folder exists, then opens Firefox on that profile.
if [ ! -d "/firefox_profile/news_fetch.profile" ]; then
    echo "Creating empty folder for profile"
    mkdir -p /firefox_profile/news_fetch.profile/
else
    echo "Profile already exists, skipping folder creation"
fi
firefox --profile /firefox_profile/news_fetch.profile
|
34
launch
34
launch
@ -10,43 +10,61 @@ export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
|
||||
export UNAME=remy
|
||||
export U_ID=1000
|
||||
|
||||
|
||||
### Main use cases ###
|
||||
if [[ $1 == "debug" ]]
|
||||
then
|
||||
export DEBUG=true
|
||||
export HEADFULL=true
|
||||
export CODE=./
|
||||
export ENTRYPOINT=/bin/bash
|
||||
# since service ports does not open ports on implicitly started containers, also start chrome:
|
||||
docker compose up -d chrome
|
||||
# since service ports does not open ports on implicitly started containers, also start geckodriver:
|
||||
docker compose up -d geckodriver
|
||||
|
||||
elif [[ $1 == "production" ]]
|
||||
then
|
||||
export DEBUG=false
|
||||
|
||||
elif [[ $1 == "build" ]]
|
||||
then
|
||||
export DEBUG=false
|
||||
docker compose build
|
||||
shift
|
||||
docker compose build "$@"
|
||||
exit 0
|
||||
|
||||
|
||||
### Manual Shutdown ###
|
||||
elif [[ $1 == "down" ]]
|
||||
then
|
||||
docker compose stop
|
||||
docker compose down -t 0
|
||||
exit 0
|
||||
elif [[ $1 == "init" ]]
|
||||
|
||||
|
||||
|
||||
### Edge cases -> for firefox ###
|
||||
elif [[ $1 == "edit_profile" ]]
|
||||
then
|
||||
export CODE=./
|
||||
export HEADFULL=true
|
||||
|
||||
docker compose up -d chrome
|
||||
docker compose up -d geckodriver
|
||||
sleep 5
|
||||
docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh
|
||||
docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh # inside the container
|
||||
docker compose down -t 0
|
||||
|
||||
|
||||
### Fallback ####
|
||||
else
|
||||
echo "Please specify the execution mode (debug/production/build) as the first argument"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
|
||||
shift # consumes the variable set in $1 so that $@ only contains the remaining arguments
|
||||
|
||||
docker compose run -it --service-ports "$@"
|
||||
|
||||
echo "Docker run finished, shutting down containers..."
|
||||
docker compose stop
|
||||
docker compose down -t 0
|
||||
echo "Bye!"
|
||||
|
@ -26,4 +26,6 @@ local_storage_path: /app/containerdata/files
|
||||
debug_storage_path: /app/containerdata/debug/
|
||||
default_download_path: /app/containerdata/tmp
|
||||
remote_storage_path: /helbing_support/Files RM/Archiving
|
||||
browser_profile_path: /user_data/news_fetch.profile
|
||||
browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
|
||||
# please keep this exact name
|
||||
browser_print_delay: 5
|
||||
|
@ -4,24 +4,27 @@ import time
|
||||
|
||||
|
||||
urls = [
|
||||
"https://www.youtube.com/watch?v=R4h_yiDIuQE",
|
||||
"https://www.youtube.com/watch?v=-G8ZI1Jq8xA",
|
||||
"https://www.youtube.com/watch?v=8eYBcASQIQI",
|
||||
"https://www.thingiverse.com/thing:5463267",
|
||||
"https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s",
|
||||
"https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s",
|
||||
"https://www.youtube.com/watch?v=bQQn_vET4ys",
|
||||
"https://www.youtube.com/watch?v=6FqNctiO06E",
|
||||
"https://www.youtube.com/watch?v=ImnuJgj8XJo",
|
||||
"https://www.youtube.com/watch?v=4QZQtSqaC34",
|
||||
"https://www.youtube.com/watch?v=cW4qIjPMGkQ",
|
||||
"https://www.youtube.com/watch?v=QWsUGpKfP8A",
|
||||
"https://www.youtube.com/watch?v=a0PwEwLG9No",
|
||||
"https://www.youtube.com/watch?v=Hd3lnWVIIpo",
|
||||
"https://www.youtube.com/watch?v=JNtdAp-BdzI",
|
||||
"https://en.wikipedia.org/wiki/Viktor_Schauberger",
|
||||
"https://de.wikipedia.org/wiki/Viktor_Schauberger",
|
||||
"https://id2020.org",
|
||||
"https://www.weforum.org/platforms/the-centre-for-cybersecurity",
|
||||
"https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf",
|
||||
"https://en.wikipedia.org/wiki/Social_Credit_System",
|
||||
"https://en.wikipedia.org/wiki/Customer_lifetime_value",
|
||||
"https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance",
|
||||
"https://www.un.org/en/about-us/universal-declaration-of-human-rights",
|
||||
"https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines",
|
||||
"https://www.wired.com/2008/06/pb-theory/",
|
||||
"https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/",
|
||||
"https://www.bbc.com/news/world-middle-east-52579475",
|
||||
"https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/",
|
||||
"https://www.delftdesignforvalues.nl",
|
||||
"https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/",
|
||||
"https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17",
|
||||
"https://www.youtube.com/watch?v=_KhAsJRk2lo",
|
||||
"https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/",
|
||||
"https://climatecitycup.org",
|
||||
|
||||
]
|
||||
|
||||
def post_download_hook(ret_code):
|
||||
# print(ret_code)
|
||||
if ret_code['status'] == 'finished':
|
||||
@ -45,10 +48,12 @@ def save_video(url):
|
||||
print(f"Youtube download crashed: {e}")
|
||||
|
||||
|
||||
# for url in urls:
|
||||
# save_video(url)
|
||||
# for i, url in enumerate(urls):
|
||||
# print(f"Downloading video {i+1} / {len(urls)}")
|
||||
# save_video(url)
|
||||
|
||||
for url in urls:
|
||||
for i, url in enumerate(urls):
|
||||
print(f"Saving url {i+1} / {len(urls)}")
|
||||
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
|
||||
wayback = WaybackMachineSaveAPI(url, user_agent)
|
||||
archive_url = wayback.save()
|
||||
|
@ -34,12 +34,14 @@
|
||||
{#each status_items as item}
|
||||
<tr>
|
||||
<td>{ item.name }</td>
|
||||
{#if (item.value != "" || status_items.valze == false) }
|
||||
{#if item.name == "Url"}
|
||||
<td class='bg-emerald-200'><a href="{ item.value }" target="_blank">{ item.value }</a></td>
|
||||
{:else}
|
||||
<td class='bg-emerald-200' style="white-space: normal; width:70%">{ item.value }</td>
|
||||
{/if}
|
||||
{#if (item.value != "" || status_items.value == false) }
|
||||
<td class='bg-emerald-200' style="white-space: normal; width:70%">
|
||||
{#if item.name == "Url"}
|
||||
<a href="{ item.value }" target="_blank">{ item.value }</a>
|
||||
{:else}
|
||||
{ item.value }
|
||||
{/if}
|
||||
</td>
|
||||
{:else}
|
||||
<td class='bg-red-200'>not set</td>
|
||||
{/if}
|
||||
|
@ -53,11 +53,14 @@ def get_article_next(id):
|
||||
|
||||
@app.route("/api/article/<int:id>/set", methods=['POST'])
|
||||
def set_article(id):
|
||||
try:
|
||||
action = request.json.get('action', None)
|
||||
except Exception as e:
|
||||
print(f"Exception in set_article {e}")
|
||||
json = request.get_json(silent=True) # do not raise 400 if there is no json!
|
||||
# no json usually means a file was uploaded
|
||||
if json is None:
|
||||
print("Detected likely file upload.")
|
||||
action = None
|
||||
else:
|
||||
action = request.json.get('action', None) # action inside the json might still be empty
|
||||
|
||||
with db:
|
||||
article = models.ArticleDownload.get_by_id(id)
|
||||
if action:
|
||||
@ -66,7 +69,7 @@ def set_article(id):
|
||||
elif action == "b":
|
||||
article.verified = -1
|
||||
else: # implicitly action == "r":
|
||||
print(request.files)
|
||||
# request.files is an immutable dict
|
||||
file = request.files.get("file", None)
|
||||
if file is None: # upload tends to crash
|
||||
return "No file uploaded", 400
|
||||
@ -74,7 +77,7 @@ def set_article(id):
|
||||
artname, _ = os.path.splitext(article.file_name)
|
||||
fname = f"{artname} -- related_{article.related.count() + 1}.{file.filename.split('.')[-1]}"
|
||||
fpath = os.path.join(article.save_path, fname)
|
||||
print(fpath)
|
||||
print(f"Saving file to {fpath}")
|
||||
file.save(fpath)
|
||||
article.set_related([fname])
|
||||
return {"file_path": fpath}
|
||||
|
@ -64,5 +64,5 @@ else:
|
||||
|
||||
from utils_storage import models
|
||||
|
||||
# Set up the database
|
||||
# Set up the database connection (also creates tables if they don't exist)
|
||||
models.set_db(download_db)
|
||||
|
@ -1,208 +0,0 @@
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.columns import Columns
|
||||
from rich.rule import Rule
|
||||
console = Console()
|
||||
hline = Rule(style="white")
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from slack_sdk import WebClient
|
||||
import configuration
|
||||
models = configuration.models
|
||||
|
||||
# Keys the reviewer can enter in verify_unchecked, mapped to the description
# shown for each key. Insertion order is the display order.
u_options = dict(
    [
        ("ENTER", "Accept PDF as is. It gets marked as verified"),
        ("D", "set languange to DE and set verified"),
        ("E", "set languange to EN and set verified"),
        ("O", "set other language (prompted)"),
        ("R", "set related files (prompted multiple times)"),
        ("B", "reject and move to folder BAD"),
        ("L", "leave file as is, do not send reaction"),
    ]
)
|
||||
|
||||
|
||||
# Slack client used to add reactions; the token is read from the SLACK section of the main config
_slack_auth_token = configuration.main_config["SLACK"]["auth_token"]
bot_client = WebClient(token=_slack_auth_token)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
    """Print a table describing one article, followed by the available options.

    Args:
        file_url: Used as the title of the table.
        file_attributes: Dicts with the keys "name", "value" and "status";
            each dict becomes one table row.
        options: Maps an input key to the description of its action.
    """
    overview = Table(title=file_url, row_styles=["white", "bright_black"], min_width=100)
    overview.add_column("Attribute", justify="right", no_wrap=True)
    overview.add_column("Value set by auto_news")
    overview.add_column("Status", justify="right")
    for attribute in file_attributes:
        overview.add_row(attribute["name"], attribute["value"], attribute["status"])

    # Keys and descriptions are rendered as two newline-joined blocks placed side by side
    key_lines = []
    action_lines = []
    for key, action in options.items():
        key_lines.append(f"[[bold]{key}[/bold]]")
        action_lines.append(f"[italic]{action}[/italic]")
    layout = Columns(["\n".join(key_lines), "\n".join(action_lines)])

    console.print(overview)
    console.print("Your options:")
    console.print(layout)
|
||||
|
||||
|
||||
def send_reaction_to_slack_thread(article, reaction):
    """Add `reaction` to the stored Slack messages whose text contains the article url.

    Nothing is sent when more than 5 messages match. Messages flagged as
    processed, or not flagged as having a single url, are skipped with a notice.
    """
    # NOTE(review): the value is unused; the attribute access is kept as in the original
    thread = article.slack_thread
    url_filter = models.Message.text.contains(article.article_url)
    matches = models.Message.select().where(url_filter)
    # TODO: rewrite this lookup
    if len(matches) > 5:
        print("Found more than 5 messages. Aborting reactions...")
        return

    archive_channel = None
    for message in matches:
        if message.is_processed_override:
            print("Message already processed. Aborting reactions...")
            continue
        if not message.has_single_url:
            print("Found thread but won't send reaction because thread has multiple urls")
            continue
        stamp = message.slack_ts
        archive_channel = configuration.main_config["SLACK"]["archive_id"]
        bot_client.reactions_add(channel=archive_channel, name=reaction, timestamp=stamp)
        print("Sent reaction to message")
|
||||
|
||||
|
||||
def prompt_language(query):
    """Ask for a 2-character language code until one is given, then store it.

    The accepted code is written to `query.language` and `query.save()` is called.
    """
    while True:
        code = input("Set language (nation-code, 2 letters) ")
        if len(code) == 2:
            break
        print("Bad code, try again")
    query.language = code
    query.save()
|
||||
|
||||
|
||||
def prompt_related(query):
    """Collect file names from the prompt until '1' is entered, then hand them over.

    All answers given before the terminating '1' are passed, in order, to
    `query.set_related` as a single list.
    """
    question = "Additional file for article? Type '1' to cancel "
    # iter(callable, sentinel) keeps prompting until the answer equals "1"
    collected = list(iter(lambda: input(question), "1"))
    query.set_related(collected)
|
||||
|
||||
|
||||
def prompt_new_fname(query):
    """Ask for a new file name, store it on `query` and mark the entry verified.

    When the previous file name was not empty, the previous file is deleted
    from `query.save_path`. The change is persisted with `query.save()`.

    Raises:
        OSError: if the previous file cannot be removed (e.g. it does not exist).
    """
    uin = input("New fname? ")
    old_fname = query.file_name
    query.file_name = uin
    query.verified = 1
    if old_fname != "":
        # os.path.join instead of plain concatenation, so that a save_path
        # without a trailing separator still resolves to the right file
        os.remove(os.path.join(query.save_path, old_fname))
    query.save()
|
||||
|
||||
|
||||
|
||||
def reject_article(article):
    """Mark `article` as bad (verified = -1), save it and react with "x" on Slack."""
    setattr(article, "verified", -1)
    article.save()
    print("Article marked as bad")
    # the reaction is sent through the shared Slack helper
    send_reaction_to_slack_thread(article, "x")
|
||||
|
||||
|
||||
def unreject_article(query):
    """Set `query.verified` back to 1 and save the entry."""
    setattr(query, "verified", 1)
    query.save()
    print("File set to verified")
|
||||
|
||||
|
||||
def accept_article(article, last_accepted):
    """Mark `article` as verified, save it and react with a check mark on Slack.

    `last_accepted` is accepted but not used. Always returns an empty string.
    """
    setattr(article, "verified", 1)
    article.save()
    print("Article accepted as GOOD")

    # the reaction is sent through the shared Slack helper
    send_reaction_to_slack_thread(article, "white_check_mark")

    linked = ""
    return linked
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def _is_unset(value):
    """Return True for values that count as missing: None, -1 or anything empty."""
    if value is None or value == -1:
        return True
    try:
        return len(value) == 0
    except TypeError:
        # values without a length (e.g. other numbers) count as set
        return False


def verify_unchecked():
    """Walk through all articles with verified == 0 and let the user decide on each.

    For every article the file is opened in evince, an overview table is
    printed, and the user is prompted until an option that finishes the
    article is chosen. The options "o" and "r" update the article and return
    to the prompt.
    """
    query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute()
    last_linked = None

    for article in query:
        console.print(hline)
        core_info = []
        values = [article.save_path, article.file_name, article.title, article.language]
        names = ["Save path", "File name", "Title", "Language"]
        for e, name in zip(values, names):
            # test for None / -1 before taking the length, which would raise on those
            unset = _is_unset(e)
            core_info.append({
                "status": "[red]██[/red]" if unset else "[green]██[/green]",
                "value": "not set" if unset else e,
                "name": name,
            })

        try:
            # close any previously opened windows:
            os.system("pkill evince")
            # then open a new one; its output is discarded to suppress evince gtk
            # warnings (DEVNULL rather than PIPE: a pipe that is never read can
            # fill up and block the viewer)
            subprocess.Popen(
                ["evince", f"file://{os.path.join(article.save_path, article.file_name)}"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except Exception as e:
            print(e)
            continue

        file_overview(
            file_url=article.article_url,
            file_attributes=core_info,
            options=u_options,
        )

        proceed = False
        while not proceed:
            uin = input("Choice ?").lower()
            if uin == "":
                last_linked = accept_article(article, last_linked)  # last linked accelerates the whole process
                proceed = True
            elif uin == "d":
                article.language = "de"
                article.verified = 1
                article.save()
                proceed = True
            elif uin == "e":
                article.language = "en"
                article.verified = 1
                article.save()
                proceed = True
            elif uin == "o":
                # the prompt is shown again afterwards
                prompt_language(article)
            elif uin == "r":
                # the prompt is shown again afterwards
                prompt_related(article)
            elif uin == "b":
                reject_article(article)
                proceed = True
            elif uin == "l":
                # do nothing
                proceed = True
            else:
                print("Invalid input")
|
@ -1,70 +1,72 @@
|
||||
import logging
|
||||
import time
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
|
||||
import os, shutil, uuid
|
||||
from pathlib import Path
|
||||
|
||||
import base64
|
||||
import requests
|
||||
from selenium import webdriver
|
||||
|
||||
import configuration
|
||||
|
||||
config = configuration.main_config["DOWNLOADS"]
|
||||
|
||||
def driver_running(f):
    """Decorator for methods that need a started driver.

    Before each call the wrapper reads the `_running` flag of the instance
    (the first positional argument) and calls its `start()` method when the
    flag is falsy. The call is then forwarded to `f` unchanged and its result
    is returned.
    """
    import functools  # local import so the decorator does not rely on a file-level import

    @functools.wraps(f)  # keep the name and docstring of the wrapped method
    def wrapper(*args, **kwargs):
        self = args[0]
        if not self._running:
            self.start()
        return f(*args, **kwargs)
    return wrapper
|
||||
|
||||
|
||||
|
||||
class PDFDownloader:
|
||||
"""Saves a given url. Fills the object it got as a parameter"""
|
||||
logger = logging.getLogger(__name__)
|
||||
# status-variable for restarting:
|
||||
running = False
|
||||
|
||||
_running = False
|
||||
|
||||
|
||||
def start(self):
|
||||
self.finish() # clear up
|
||||
|
||||
options = webdriver.ChromeOptions()
|
||||
options.add_argument(f"user-data-dir={config['browser_profile_path']}")
|
||||
options.add_argument('--headless')
|
||||
"""Called externally to start the driver, but after an exception can also be called internally"""
|
||||
if self._running:
|
||||
self.finish() # clear up
|
||||
|
||||
# if os.getenv("DEBUG", "false") == "true":
|
||||
# self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
|
||||
# else:
|
||||
self.logger.info("Starting geckodriver")
|
||||
|
||||
reduced_path = self.create_tmp_profile()
|
||||
profile = webdriver.FirefoxProfile(reduced_path)
|
||||
options = webdriver.FirefoxOptions()
|
||||
|
||||
# options.set_preference('print.save_as_pdf.links.enabled', True)
|
||||
# # Just save if the filetype is pdf already
|
||||
# # TODO: this is not working right now
|
||||
if os.getenv("DEBUG", "false") == "true":
|
||||
self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
|
||||
else:
|
||||
options.add_argument('--headless')
|
||||
|
||||
# options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
|
||||
# options.set_preference("browser.download.folderList", 2)
|
||||
# # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
|
||||
# # options.set_preference("pdfjs.disabled", True)
|
||||
# options.set_preference("browser.download.dir", config["default_download_path"])
|
||||
|
||||
self.logger.info("Starting chrome driver")
|
||||
self.driver = webdriver.Remote(
|
||||
command_executor = 'http://chrome:4444', # the host chrome points to the chrome container
|
||||
command_executor = 'http://geckodriver:4444', # the host geckodriver points to the geckodriver container
|
||||
options = options,
|
||||
# can't set log path...
|
||||
browser_profile = profile
|
||||
)
|
||||
|
||||
self.running = True
|
||||
self._running = True
|
||||
|
||||
def autostart(self):
|
||||
if not self.running:
|
||||
self.start() # relaunch the dl util
|
||||
|
||||
def finish(self):
|
||||
if self.running:
|
||||
self.logger.info("Exiting chrome driver")
|
||||
try:
|
||||
self.driver.quit()
|
||||
time.sleep(10)
|
||||
except:
|
||||
self.logger.critical("Connection to the driver broke off")
|
||||
self.running = False
|
||||
else:
|
||||
self.logger.info("Chrome driver not yet running")
|
||||
self.logger.info("Exiting Geckodriver")
|
||||
try:
|
||||
self.driver.quit()
|
||||
time.sleep(10)
|
||||
except:
|
||||
self.logger.critical("Connection to the driver broke off")
|
||||
self._running = False
|
||||
|
||||
|
||||
@driver_running
|
||||
def download(self, article_object):
|
||||
sleep_time = 2
|
||||
self.autostart()
|
||||
sleep_time = int(config["browser_print_delay"])
|
||||
url = article_object.article_url
|
||||
|
||||
try:
|
||||
@ -89,20 +91,17 @@ class PDFDownloader:
|
||||
dst = os.path.join(article_object.save_path, fname)
|
||||
|
||||
|
||||
if url[-4:] == ".pdf":
|
||||
# according to the browser preferences, calling the url will open pdfjs.
|
||||
# If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
|
||||
if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly
|
||||
success = self.get_exisiting_pdf(url, dst)
|
||||
else:
|
||||
success = self.get_new_pdf(dst)
|
||||
|
||||
|
||||
if success:
|
||||
article_object.file_name = fname
|
||||
else:
|
||||
article_object.file_name = ""
|
||||
|
||||
return article_object # this change is saved later by the external caller
|
||||
return article_object # this change is saved later by the external caller
|
||||
|
||||
|
||||
def get_exisiting_pdf(self, url, dst):
|
||||
@ -134,9 +133,26 @@ class PDFDownloader:
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed, because of FS-operation: {e}")
|
||||
return False
|
||||
|
||||
|
||||
|
||||
def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
    """Copy a reduced set of profile entries into a fresh folder under /tmp.

    Args:
        full_profile_path: Folder of the full browser profile (a `Path` or a
            `str`); defaults to the configured "browser_profile_path".

    Returns:
        Path of the newly created `/tmp/firefox_profile_<uuid>` folder.

    Raises:
        OSError: if the target folder cannot be created or one of the listed
            directories/files cannot be copied (e.g. it is missing).
    """
    full_profile_path = Path(full_profile_path)  # also accept plain strings
    reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
    self.logger.debug(f"Copying profile from {full_profile_path} to {reduced_profile_path}")
    os.mkdir(reduced_profile_path)

    # copy needed directories
    for dir_name in ("extensions", "storage"):
        shutil.copytree(full_profile_path / dir_name, reduced_profile_path / dir_name)

    # copy needed files
    needed_files = (
        "extension-preferences.json",
        "addons.json",
        "addonStartup.json.lz4",
        "prefs.js",
        "extensions.json",
        "cookies.sqlite",
    )
    for file_name in needed_files:
        shutil.copy(full_profile_path / file_name, reduced_profile_path)

    # only regular files contribute to the reported size
    total_bytes = sum(p.stat().st_size for p in reduced_profile_path.rglob('*') if p.is_file())
    folder_size = round(total_bytes / 1024 / 1024, 3)
    self.logger.info(f"Generated temporary profile with size {folder_size} MB")
    return reduced_profile_path
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user