Switched from geckodriver to chrome

This commit is contained in:
Remy Moll 2022-09-18 19:26:55 +02:00
parent 7cf7422b46
commit db161e50c8
13 changed files with 135 additions and 61 deletions

View File

@ -76,10 +76,24 @@ docker compose --env-file env/production logs -f news_fetch # follows along with
docker compose --env-file env/production down docker compose --env-file env/production down
``` ```
### First run:
> The program relies on a functioning chrome profile!
For the first run ever, run
`./launch init`
This will generate a new chrome profile under `coss_archiving/dependencies/news_fetch.profile`.
You can then go to [http://localhost:7900](http://localhost:7900) in your browser. Verify the profile (under chrome://profile-internals).
Now install two add-ons: "I don't care about cookies" (enable it via chrome://extensions) and "Bypass Paywalls" (from https://github.com/iamadamdev/bypass-paywalls-chrome). The script already downloaded the latter, so just enable developer mode, click "Load unpacked", navigate to `/user_data/dependencies/news_fetch.profile`, and select the directory `bypass-paywalls-chrome-master`.
Whenever you need to make changes to the profile, for instance re-log in to websites, just rerun `./launch init`.
## Building ## Building
> The software (firefox, selenium, python) changes frequently. For non-breaking changes it is useful to regularly re build the docker image! This is also crucial to update the code itself. > The software **will** change. Because the images referenced in docker compose are usually the `latest` ones, it is sufficient to update the containers.
In docker compose, run In docker compose, run

View File

@ -0,0 +1,16 @@
#!/bin/bash
# One-time Chrome profile bootstrap for the selenium/standalone-chrome container:
# creates a reusable profile under /user_data/news_fetch.profile, pre-downloads
# the Bypass Paywalls extension into it, then launches Chrome with that profile
# so the operator can finish configuration (log-ins, loading the add-ons).

PROFILE="/user_data/news_fetch.profile"

if [ -d "$PROFILE" ]
then
    echo "Profile already exists, skipping creation"
else
    # Start Chrome once so it writes its initial default profile, then copy
    # that profile out to the shared volume.
    google-chrome &
    sleep 5  # give Chrome time to create /home/seluser/.config/google-chrome/Default
    cp -r /home/seluser/.config/google-chrome/Default "$PROFILE"
    PID=$(pidof chrome)
    echo "Now killing processes with pid:" $PID
    kill $PID
    # Fetch the Bypass Paywalls extension into the profile directory so it can
    # be loaded unpacked from chrome://extensions. Abort if the cd fails so we
    # never download/unzip into the wrong directory.
    cd "$PROFILE" || exit 1
    wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip
    unzip master.zip
fi

# Hand the prepared profile to an interactive Chrome session for manual tweaks.
google-chrome --user-data-dir="$PROFILE"

View File

@ -33,13 +33,17 @@ services:
- /sync/nas_sync.config - /sync/nas_sync.config
geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
image: ${GECKODRIVER_IMG} image: selenium/standalone-chrome:latest
shm_size: 2gb shm_size: 2gb
environment: environment:
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash) - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
- START_XVFB=${HEADFULL-false} - START_XVFB=${HEADFULL-false}
- SE_VNC_NO_PASSWORD=1 - SE_VNC_NO_PASSWORD=1
volumes:
- ${CONTAINER_DATA}/dependencies:/user_data
- ${CODE:-/dev/null}:/code
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
expose: ["4444"] # exposed to other docker-compose services only expose: ["4444"] # exposed to other docker-compose services only
ports: ports:
- 7900:7900 # port for webvnc - 7900:7900 # port for webvnc
@ -59,7 +63,7 @@ services:
depends_on: # when using docker compose run news_fetch, the dependencies are started as well depends_on: # when using docker compose run news_fetch, the dependencies are started as well
- nas_sync - nas_sync
- geckodriver - chrome
- db_passthrough - db_passthrough
volumes: volumes:
@ -68,6 +72,7 @@ services:
environment: environment:
- DEBUG=${DEBUG} - DEBUG=${DEBUG}
- UNAME=${UNAME} - UNAME=${UNAME}
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
entrypoint: ${ENTRYPOINT:-python runner.py} # by default launch workers as defined in the Dockerfile entrypoint: ${ENTRYPOINT:-python runner.py} # by default launch workers as defined in the Dockerfile
# stdin_open: ${INTERACTIVE:-false} # docker run -i # stdin_open: ${INTERACTIVE:-false} # docker run -i
# tty: ${INTERACTIVE:-false} # docker run -t # tty: ${INTERACTIVE:-false} # docker run -t
@ -76,7 +81,7 @@ services:
news_check: # Creates a small webapp on http://localhost:8080 to check previously generated pdfs (some of which are unusable and must be marked as such) news_check: # Creates a small webapp on http://localhost:8080 to check previously generated pdfs (some of which are unusable and must be marked as such)
build: news_check build: news_check
image: news_check:latest image: news_check:latest
user: 1000:1000 # since the app writes files to the local filesystem, it must be run as the current user user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
depends_on: depends_on:
- db_passthrough - db_passthrough
volumes: volumes:

3
env/debug vendored
View File

@ -1,9 +1,8 @@
# Runs in a debugging mode, does not launch anything at all but starts a bash process # Runs in a debugging mode, does not launch anything at all but starts a bash process
export CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy export UNAME=remy
export GECKODRIVER_IMG=selenium/standalone-firefox:104.0
export DEBUG=true export DEBUG=true
export HEADFULL=true export HEADFULL=true
export CODE=./ export CODE=./

2
env/production vendored
View File

@ -3,5 +3,5 @@
CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy export UNAME=remy
export GECKODRIVER_IMG=selenium/standalone-firefox:104.0 export U_ID=1000
export DEBUG=false export DEBUG=false

16
launch
View File

@ -8,9 +8,7 @@ echo "Bash script launching COSS_ARCHIVING..."
# CHANGE ME ONCE! # CHANGE ME ONCE!
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy export UNAME=remy
# CHANGE ME WHEN UPDATING FIREFOX export U_ID=1000
export GECKODRIVER_IMG=selenium/standalone-firefox:104.0
# version must be >= than the one on the host or firefox will not start (because of mismatched config)
if [[ $1 == "debug" ]] if [[ $1 == "debug" ]]
then then
@ -18,8 +16,8 @@ then
export HEADFULL=true export HEADFULL=true
export CODE=./ export CODE=./
export ENTRYPOINT=/bin/bash export ENTRYPOINT=/bin/bash
# since service ports does not open ports on implicitly started containers, also start geckodriver: # since service ports does not open ports on implicitly started containers, also start chrome:
docker compose up -d geckodriver docker compose up -d chrome
elif [[ $1 == "production" ]] elif [[ $1 == "production" ]]
then then
export DEBUG=false export DEBUG=false
@ -32,6 +30,14 @@ elif [[ $1 == "down" ]]
then then
docker compose stop docker compose stop
exit 0 exit 0
elif [[ $1 == "init" ]]
then
export CODE=./
export HEADFULL=true
docker compose up -d chrome
sleep 5
docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh
else else
echo "Please specify the execution mode (debug/production/build) as the first argument" echo "Please specify the execution mode (debug/production/build) as the first argument"
exit 1 exit 1

View File

@ -26,5 +26,4 @@ local_storage_path: /app/containerdata/files
debug_storage_path: /app/containerdata/debug/ debug_storage_path: /app/containerdata/debug/
default_download_path: /app/containerdata/tmp default_download_path: /app/containerdata/tmp
remote_storage_path: /helbing_support/Files RM/Archiving remote_storage_path: /helbing_support/Files RM/Archiving
browser_profile_path: /app/containerdata/dependencies/7hlyfqxt.Auto News browser_profile_path: /user_data/news_fetch.profile
blacklisted_href_domains: ["google.", "facebook."]

56
misc/youtube_batch.py Normal file
View File

@ -0,0 +1,56 @@
import youtube_dl
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import time
# URLs to process. Mostly YouTube videos, plus a few non-video pages
# (Thingiverse, Wikipedia) — the loop at the bottom of this script submits
# every entry to the Wayback Machine regardless of type.
urls = [
    "https://www.youtube.com/watch?v=R4h_yiDIuQE",
    "https://www.youtube.com/watch?v=-G8ZI1Jq8xA",
    "https://www.youtube.com/watch?v=8eYBcASQIQI",
    "https://www.thingiverse.com/thing:5463267",
    "https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s",
    "https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s",
    "https://www.youtube.com/watch?v=bQQn_vET4ys",
    "https://www.youtube.com/watch?v=6FqNctiO06E",
    "https://www.youtube.com/watch?v=ImnuJgj8XJo",
    "https://www.youtube.com/watch?v=4QZQtSqaC34",
    "https://www.youtube.com/watch?v=cW4qIjPMGkQ",
    "https://www.youtube.com/watch?v=QWsUGpKfP8A",
    "https://www.youtube.com/watch?v=a0PwEwLG9No",
    "https://www.youtube.com/watch?v=Hd3lnWVIIpo",
    "https://www.youtube.com/watch?v=JNtdAp-BdzI",
    "https://en.wikipedia.org/wiki/Viktor_Schauberger",
    "https://de.wikipedia.org/wiki/Viktor_Schauberger",
]
def post_download_hook(ret_code):
    """youtube_dl progress hook: print the output path once a download finishes."""
    if ret_code['status'] == 'finished':
        print(ret_code["filename"])
def save_video(url):
    """Download the video at *url* via youtube_dl (best quality up to 720p).

    Failures are printed rather than raised so one bad URL cannot abort a batch.
    """
    options = {
        'format': 'best[height<=720]',
        # the resulting filename is reported by post_download_hook
        'progress_hooks': [post_download_hook],
        'updatetime': False,
    }
    try:
        with youtube_dl.YoutubeDL(options) as downloader:
            downloader.download([url])
    except Exception as e:
        print(f"Youtube download crashed: {e}")
# Local download pass — currently disabled; only the Wayback upload below runs.
# for url in urls:
#     save_video(url)

# Submit every URL to the Internet Archive's Wayback Machine and print the
# resulting snapshot URL. The 20 s pause presumably keeps us under
# archive.org's rate limit — TODO confirm the actual limit.
for url in urls:
    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
    wayback = WaybackMachineSaveAPI(url, user_agent)
    archive_url = wayback.save()
    print(archive_url)
    time.sleep(20)

View File

@ -9,12 +9,15 @@
{name: 'Language', value: article_data.language}, {name: 'Language', value: article_data.language},
{name: 'Authors', value: article_data.authors}, {name: 'Authors', value: article_data.authors},
{name: "Related", value: article_data.related}, {name: "Related", value: article_data.related},
{name: "Sent", value: article_data.sent},
] ]
</script> </script>
<style> <style>
a { td {
word-break: break-all; overflow-wrap: break-word;
word-wrap: break-word;
word-break: break-word;
} }
</style> </style>
<div class="card bg-neutral-300 shadow-xl overflow-x-auto"> <div class="card bg-neutral-300 shadow-xl overflow-x-auto">
@ -31,9 +34,9 @@
{#each status_items as item} {#each status_items as item}
<tr> <tr>
<td>{ item.name }</td> <td>{ item.name }</td>
{#if item.value != ""} {#if (item.value != "" || status_items.valze == false) }
{#if item.name == "Url"} {#if item.name == "Url"}
<td class='bg-emerald-200'><a href="{ item.value }">{ item.value }</a></td> <td class='bg-emerald-200'><a href="{ item.value }" target="_blank">{ item.value }</a></td>
{:else} {:else}
<td class='bg-emerald-200' style="white-space: normal; width:70%">{ item.value }</td> <td class='bg-emerald-200' style="white-space: normal; width:70%">{ item.value }</td>
{/if} {/if}

View File

@ -5,15 +5,8 @@ ENV TZ Europe/Zurich
RUN apt-get update && apt-get install -y ghostscript RUN apt-get update && apt-get install -y ghostscript
# for compression of pdfs # for compression of pdfs
# RUN useradd --create-home --shell /bin/bash --uid 1001 autonews
# id mapped to local user
# home directory needed for pip package installation
# RUN export PATH=/home/autonews/.local/bin:$PATH
RUN mkdir -p /app/auto_news RUN mkdir -p /app/auto_news
# RUN chown -R autonews:autonews /app
# USER autonews
COPY requirements.txt /app/requirements.txt COPY requirements.txt /app/requirements.txt
RUN python3 -m pip install -r /app/requirements.txt RUN python3 -m pip install -r /app/requirements.txt

View File

@ -6,10 +6,8 @@ import base64
import requests import requests
from selenium import webdriver from selenium import webdriver
import configuration import configuration
import json
config = configuration.main_config["DOWNLOADS"] config = configuration.main_config["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"])
class PDFDownloader: class PDFDownloader:
@ -21,42 +19,31 @@ class PDFDownloader:
def start(self): def start(self):
self.finish() # clear up self.finish() # clear up
options = webdriver.FirefoxOptions() options = webdriver.ChromeOptions()
options.profile = config["browser_profile_path"] options.add_argument(f"user-data-dir={config['browser_profile_path']}")
# should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work options.add_argument('--headless')
if os.getenv("DEBUG", "false") == "true": # if os.getenv("DEBUG", "false") == "true":
self.logger.warning("Opening browser GUI because of 'DEBUG=true'") # self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
else: # else:
options.add_argument('--headless')
options.set_preference('print.save_as_pdf.links.enabled', True) # options.set_preference('print.save_as_pdf.links.enabled', True)
# Just save if the filetype is pdf already # # Just save if the filetype is pdf already
# TODO: this is not working right now # # TODO: this is not working right now
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True) # options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
options.set_preference("browser.download.folderList", 2) # options.set_preference("browser.download.folderList", 2)
# options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") # # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# options.set_preference("pdfjs.disabled", True) # # options.set_preference("pdfjs.disabled", True)
options.set_preference("browser.download.dir", config["default_download_path"]) # options.set_preference("browser.download.dir", config["default_download_path"])
self.logger.info("Starting gecko driver") self.logger.info("Starting chrome driver")
# peviously, in a single docker image:
# self.driver = webdriver.Firefox(
# options = options,
# service = webdriver.firefox.service.Service(
# log_path = f'{config["local_storage_path"]}/geckodriver.log'
# ))
self.driver = webdriver.Remote( self.driver = webdriver.Remote(
command_executor = 'http://geckodriver:4444', command_executor = 'http://chrome:4444', # the host chrome points to the chrome container
options = options, options = options,
# can't set log path... # can't set log path...
) )
residues = os.listdir(config["default_download_path"])
for res in residues:
os.remove(os.path.join(config["default_download_path"], res))
self.running = True self.running = True
def autostart(self): def autostart(self):
@ -65,7 +52,7 @@ class PDFDownloader:
def finish(self): def finish(self):
if self.running: if self.running:
self.logger.info("Exiting gecko driver") self.logger.info("Exiting chrome driver")
try: try:
self.driver.quit() self.driver.quit()
time.sleep(10) time.sleep(10)
@ -73,7 +60,7 @@ class PDFDownloader:
self.logger.critical("Connection to the driver broke off") self.logger.critical("Connection to the driver broke off")
self.running = False self.running = False
else: else:
self.logger.info("Gecko driver not yet running") self.logger.info("Chrome driver not yet running")
def download(self, article_object): def download(self, article_object):
sleep_time = 2 sleep_time = 2
@ -153,8 +140,6 @@ class PDFDownloader:
def make_path_unique(path): def make_path_unique(path):
fname, ending = os.path.splitext(path) fname, ending = os.path.splitext(path)
fname += datetime.datetime.now().strftime("%d-%H%M%S") fname += datetime.datetime.now().strftime("%d-%H%M%S")

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import youtube_dl import youtube_dl
import os import os
import logging import logging

View File

@ -1,4 +1,3 @@
import time
from waybackpy import WaybackMachineSaveAPI # upload to archive.org from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)