Switched from geckodriver to chrome
This commit is contained in:
parent
7cf7422b46
commit
db161e50c8
16
README.md
16
README.md
@ -76,10 +76,24 @@ docker compose --env-file env/production logs -f news_fetch # follows along with
|
|||||||
docker compose --env-file env/production down
|
docker compose --env-file env/production down
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### First run:
|
||||||
|
> The program relies on a functioning chrome profile!
|
||||||
|
|
||||||
|
For the first run ever, run
|
||||||
|
|
||||||
|
`./launch init`
|
||||||
|
|
||||||
|
This will generate a new chrome profile under `coss_archiving/dependencies/news_fetch.profile`.
|
||||||
|
You can then go to [http://localhost:7900](http://localhost:7900) in your browser. Verify the profile (under chrome://profile-internals).
|
||||||
|
|
||||||
|
Now install two addons: Idontcareaboutcookies (from chrome://extensions) and Bypass Paywalls (from https://github.com/iamadamdev/bypass-paywalls-chrome). The script has already downloaded and unpacked Bypass Paywalls, so just enable developer mode, click "Load unpacked", go to `/user_data/news_fetch.profile` and select the directory `bypass-paywalls-chrome-master`.
|
||||||
|
|
||||||
|
Whenever you need to make changes to the profile, for instance re-log in to websites, just rerun `./launch init`.
|
||||||
|
|
||||||
|
|
||||||
## Building
|
## Building
|
||||||
|
|
||||||
> The software (firefox, selenium, python) changes frequently. For non-breaking changes it is useful to regularly re build the docker image! This is also crucial to update the code itself.
|
> The software **will** change. Because the images referenced in docker compose are usually the `latest` ones, it is sufficient to update the containers.
|
||||||
|
|
||||||
In docker compose, run
|
In docker compose, run
|
||||||
|
|
||||||
|
16
chrome/change_configuration.sh
Normal file
16
chrome/change_configuration.sh
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/bash
# Prepares the chrome profile used by news_fetch and then opens chrome on it.
# Meant to be run inside the chrome container (it is started by `./launch init`).
#
# First run (profile directory missing): start chrome once so that it writes its
# default profile, copy that profile to /user_data, stop chrome again and unpack
# the bypass-paywalls extension inside the copied profile.
# Every run: open chrome on the profile so that it can be adjusted by hand.

PROFILE_DIR=/user_data/news_fetch.profile

if [ -d "$PROFILE_DIR" ]
then
    echo "Profile already exists, skipping creation"
else
    google-chrome &
    sleep 5 # give chrome some time to write its default profile
    cp -r /home/seluser/.config/google-chrome/Default "$PROFILE_DIR"
    PID=$(pidof chrome)
    echo "Now killing processes with pid:" $PID
    # intentionally unquoted: pidof may report several pids
    kill $PID
    # abort rather than download the extension into whatever the current directory is
    cd "$PROFILE_DIR" || { echo "Could not enter $PROFILE_DIR, aborting"; exit 1; }
    wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip
    unzip master.zip
fi

google-chrome --user-data-dir="$PROFILE_DIR"
|
@ -33,13 +33,17 @@ services:
|
|||||||
- /sync/nas_sync.config
|
- /sync/nas_sync.config
|
||||||
|
|
||||||
|
|
||||||
geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
|
chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
|
||||||
image: ${GECKODRIVER_IMG}
|
image: selenium/standalone-chrome:latest
|
||||||
shm_size: 2gb
|
shm_size: 2gb
|
||||||
environment:
|
environment:
|
||||||
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
|
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
|
||||||
- START_XVFB=${HEADFULL-false}
|
- START_XVFB=${HEADFULL-false}
|
||||||
- SE_VNC_NO_PASSWORD=1
|
- SE_VNC_NO_PASSWORD=1
|
||||||
|
volumes:
|
||||||
|
- ${CONTAINER_DATA}/dependencies:/user_data
|
||||||
|
- ${CODE:-/dev/null}:/code
|
||||||
|
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
|
||||||
expose: ["4444"] # exposed to other docker-compose services only
|
expose: ["4444"] # exposed to other docker-compose services only
|
||||||
ports:
|
ports:
|
||||||
- 7900:7900 # port for webvnc
|
- 7900:7900 # port for webvnc
|
||||||
@ -59,7 +63,7 @@ services:
|
|||||||
|
|
||||||
depends_on: # when using docker compose run news_fetch, the dependencies are started as well
|
depends_on: # when using docker compose run news_fetch, the dependencies are started as well
|
||||||
- nas_sync
|
- nas_sync
|
||||||
- geckodriver
|
- chrome
|
||||||
- db_passthrough
|
- db_passthrough
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
@ -68,6 +72,7 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- DEBUG=${DEBUG}
|
- DEBUG=${DEBUG}
|
||||||
- UNAME=${UNAME}
|
- UNAME=${UNAME}
|
||||||
|
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
|
||||||
entrypoint: ${ENTRYPOINT:-python runner.py} # by default launch workers as defined in the Dockerfile
|
entrypoint: ${ENTRYPOINT:-python runner.py} # by default launch workers as defined in the Dockerfile
|
||||||
# stdin_open: ${INTERACTIVE:-false} # docker run -i
|
# stdin_open: ${INTERACTIVE:-false} # docker run -i
|
||||||
# tty: ${INTERACTIVE:-false} # docker run -t
|
# tty: ${INTERACTIVE:-false} # docker run -t
|
||||||
@ -76,7 +81,7 @@ services:
|
|||||||
news_check: # Creates a small webapp on http://localhost:8080 to check previously generated pdfs (some of which are unusable and must be marked as such)
|
news_check: # Creates a small webapp on http://localhost:8080 to check previously generated pdfs (some of which are unusable and must be marked as such)
|
||||||
build: news_check
|
build: news_check
|
||||||
image: news_check:latest
|
image: news_check:latest
|
||||||
user: 1000:1000 # since the app writes files to the local filesystem, it must be run as the current user
|
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
|
||||||
depends_on:
|
depends_on:
|
||||||
- db_passthrough
|
- db_passthrough
|
||||||
volumes:
|
volumes:
|
||||||
|
3
env/debug
vendored
3
env/debug
vendored
@ -1,9 +1,8 @@
|
|||||||
# Runs in a debugging mode, does not launch anything at all but starts a bash process
|
# Runs in a debugging mode, does not launch anything at all but starts a bash process
|
||||||
|
|
||||||
export CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
|
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
|
||||||
export UNAME=remy
|
export UNAME=remy
|
||||||
|
|
||||||
export GECKODRIVER_IMG=selenium/standalone-firefox:104.0
|
|
||||||
export DEBUG=true
|
export DEBUG=true
|
||||||
export HEADFULL=true
|
export HEADFULL=true
|
||||||
export CODE=./
|
export CODE=./
|
||||||
|
2
env/production
vendored
2
env/production
vendored
@ -3,5 +3,5 @@
|
|||||||
CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
|
CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
|
||||||
|
|
||||||
export UNAME=remy
|
export UNAME=remy
|
||||||
export GECKODRIVER_IMG=selenium/standalone-firefox:104.0
|
export U_ID=1000
|
||||||
export DEBUG=false
|
export DEBUG=false
|
||||||
|
16
launch
16
launch
@ -8,9 +8,7 @@ echo "Bash script launching COSS_ARCHIVING..."
|
|||||||
# CHANGE ME ONCE!
|
# CHANGE ME ONCE!
|
||||||
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
|
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
|
||||||
export UNAME=remy
|
export UNAME=remy
|
||||||
# CHANGE ME WHEN UPDATING FIREFOX
|
export U_ID=1000
|
||||||
export GECKODRIVER_IMG=selenium/standalone-firefox:104.0
|
|
||||||
# version must be >= than the one on the host or firefox will not start (because of mismatched config)
|
|
||||||
|
|
||||||
if [[ $1 == "debug" ]]
|
if [[ $1 == "debug" ]]
|
||||||
then
|
then
|
||||||
@ -18,8 +16,8 @@ then
|
|||||||
export HEADFULL=true
|
export HEADFULL=true
|
||||||
export CODE=./
|
export CODE=./
|
||||||
export ENTRYPOINT=/bin/bash
|
export ENTRYPOINT=/bin/bash
|
||||||
# since service ports does not open ports on implicitly started containers, also start geckodriver:
|
# since service ports does not open ports on implicitly started containers, also start chrome:
|
||||||
docker compose up -d geckodriver
|
docker compose up -d chrome
|
||||||
elif [[ $1 == "production" ]]
|
elif [[ $1 == "production" ]]
|
||||||
then
|
then
|
||||||
export DEBUG=false
|
export DEBUG=false
|
||||||
@ -32,6 +30,14 @@ elif [[ $1 == "down" ]]
|
|||||||
then
|
then
|
||||||
docker compose stop
|
docker compose stop
|
||||||
exit 0
|
exit 0
|
||||||
|
elif [[ $1 == "init" ]]
|
||||||
|
then
|
||||||
|
export CODE=./
|
||||||
|
export HEADFULL=true
|
||||||
|
|
||||||
|
docker compose up -d chrome
|
||||||
|
sleep 5
|
||||||
|
docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh
|
||||||
else
|
else
|
||||||
echo "Please specify the execution mode (debug/production/build) as the first argument"
|
echo "Please specify the execution mode (debug/production/build) as the first argument"
|
||||||
exit 1
|
exit 1
|
||||||
|
@ -26,5 +26,4 @@ local_storage_path: /app/containerdata/files
|
|||||||
debug_storage_path: /app/containerdata/debug/
|
debug_storage_path: /app/containerdata/debug/
|
||||||
default_download_path: /app/containerdata/tmp
|
default_download_path: /app/containerdata/tmp
|
||||||
remote_storage_path: /helbing_support/Files RM/Archiving
|
remote_storage_path: /helbing_support/Files RM/Archiving
|
||||||
browser_profile_path: /app/containerdata/dependencies/7hlyfqxt.Auto News
|
browser_profile_path: /user_data/news_fetch.profile
|
||||||
blacklisted_href_domains: ["google.", "facebook."]
|
|
||||||
|
56
misc/youtube_batch.py
Normal file
56
misc/youtube_batch.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
import youtube_dl
|
||||||
|
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
urls = [
|
||||||
|
"https://www.youtube.com/watch?v=R4h_yiDIuQE",
|
||||||
|
"https://www.youtube.com/watch?v=-G8ZI1Jq8xA",
|
||||||
|
"https://www.youtube.com/watch?v=8eYBcASQIQI",
|
||||||
|
"https://www.thingiverse.com/thing:5463267",
|
||||||
|
"https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s",
|
||||||
|
"https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s",
|
||||||
|
"https://www.youtube.com/watch?v=bQQn_vET4ys",
|
||||||
|
"https://www.youtube.com/watch?v=6FqNctiO06E",
|
||||||
|
"https://www.youtube.com/watch?v=ImnuJgj8XJo",
|
||||||
|
"https://www.youtube.com/watch?v=4QZQtSqaC34",
|
||||||
|
"https://www.youtube.com/watch?v=cW4qIjPMGkQ",
|
||||||
|
"https://www.youtube.com/watch?v=QWsUGpKfP8A",
|
||||||
|
"https://www.youtube.com/watch?v=a0PwEwLG9No",
|
||||||
|
"https://www.youtube.com/watch?v=Hd3lnWVIIpo",
|
||||||
|
"https://www.youtube.com/watch?v=JNtdAp-BdzI",
|
||||||
|
"https://en.wikipedia.org/wiki/Viktor_Schauberger",
|
||||||
|
"https://de.wikipedia.org/wiki/Viktor_Schauberger",
|
||||||
|
]
|
||||||
|
def post_download_hook(ret_code):
    """Progress hook for youtube_dl: report the file of a finished download.

    Args:
        ret_code: the status dictionary youtube_dl passes to progress hooks.
            Only the keys 'status' and 'filename' are read.

    Returns:
        The value of ret_code['filename'] once the status is 'finished',
        otherwise None (download still running, failed, or no status given).
    """
    # .get() so that a status dict without the expected keys is ignored
    # instead of raising a KeyError in the middle of a download
    if ret_code.get('status') != 'finished':
        return None

    file_loc = ret_code.get("filename")
    print(file_loc)
    return file_loc
|
||||||
|
|
||||||
|
|
||||||
|
def save_video(url):
    """Download the video behind `url` with youtube_dl.

    The best available format of at most 720p is requested. No output
    template is set, so youtube_dl chooses the file name itself; the name is
    printed by post_download_hook once the download has finished.

    Args:
        url: address of the video page.

    Returns:
        True if youtube_dl ran through without raising, False otherwise.
        A failure is printed, never raised, so that a batch can continue.
    """
    ydl_opts = {
        'format': 'best[height<=720]',
        # 'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
        'progress_hooks': [post_download_hook],
        'updatetime': False
    }
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        # deliberate catch-all: one broken url must not stop the remaining downloads
        print(f"Youtube download crashed: {e}")
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
# for url in urls:
|
||||||
|
# save_video(url)
|
||||||
|
|
||||||
|
# Submit every url to the wayback machine (archive.org), one after the other.
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?

for url in urls:
    try:
        wayback = WaybackMachineSaveAPI(url, user_agent)
        archive_url = wayback.save()
        print(archive_url)
    except Exception as e:
        # a single failed upload must not abort the rest of the batch
        print(f"Wayback upload crashed for {url}: {e}")
    # pause between two save requests
    time.sleep(20)
|
@ -9,12 +9,15 @@
|
|||||||
{name: 'Language', value: article_data.language},
|
{name: 'Language', value: article_data.language},
|
||||||
{name: 'Authors', value: article_data.authors},
|
{name: 'Authors', value: article_data.authors},
|
||||||
{name: "Related", value: article_data.related},
|
{name: "Related", value: article_data.related},
|
||||||
|
{name: "Sent", value: article_data.sent},
|
||||||
]
|
]
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<style>
|
<style>
|
||||||
a {
|
td {
|
||||||
word-break: break-all;
|
overflow-wrap: break-word;
|
||||||
|
word-wrap: break-word;
|
||||||
|
word-break: break-word;
|
||||||
}
|
}
|
||||||
</style>
|
</style>
|
||||||
<div class="card bg-neutral-300 shadow-xl overflow-x-auto">
|
<div class="card bg-neutral-300 shadow-xl overflow-x-auto">
|
||||||
@ -31,9 +34,9 @@
|
|||||||
{#each status_items as item}
|
{#each status_items as item}
|
||||||
<tr>
|
<tr>
|
||||||
<td>{ item.name }</td>
|
<td>{ item.name }</td>
|
||||||
{#if item.value != ""}
|
{#if (item.value != "" || item.value === false) }
|
||||||
{#if item.name == "Url"}
|
{#if item.name == "Url"}
|
||||||
<td class='bg-emerald-200'><a href="{ item.value }">{ item.value }</a></td>
|
<td class='bg-emerald-200'><a href="{ item.value }" target="_blank">{ item.value }</a></td>
|
||||||
{:else}
|
{:else}
|
||||||
<td class='bg-emerald-200' style="white-space: normal; width:70%">{ item.value }</td>
|
<td class='bg-emerald-200' style="white-space: normal; width:70%">{ item.value }</td>
|
||||||
{/if}
|
{/if}
|
||||||
|
@ -5,15 +5,8 @@ ENV TZ Europe/Zurich
|
|||||||
RUN apt-get update && apt-get install -y ghostscript
|
RUN apt-get update && apt-get install -y ghostscript
|
||||||
# for compression of pdfs
|
# for compression of pdfs
|
||||||
|
|
||||||
# RUN useradd --create-home --shell /bin/bash --uid 1001 autonews
|
|
||||||
# id mapped to local user
|
|
||||||
# home directory needed for pip package installation
|
|
||||||
# RUN export PATH=/home/autonews/.local/bin:$PATH
|
|
||||||
|
|
||||||
|
|
||||||
RUN mkdir -p /app/auto_news
|
RUN mkdir -p /app/auto_news
|
||||||
# RUN chown -R autonews:autonews /app
|
|
||||||
# USER autonews
|
|
||||||
|
|
||||||
COPY requirements.txt /app/requirements.txt
|
COPY requirements.txt /app/requirements.txt
|
||||||
RUN python3 -m pip install -r /app/requirements.txt
|
RUN python3 -m pip install -r /app/requirements.txt
|
||||||
|
@ -6,10 +6,8 @@ import base64
|
|||||||
import requests
|
import requests
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
import configuration
|
import configuration
|
||||||
import json
|
|
||||||
|
|
||||||
config = configuration.main_config["DOWNLOADS"]
|
config = configuration.main_config["DOWNLOADS"]
|
||||||
blacklisted = json.loads(config["blacklisted_href_domains"])
|
|
||||||
|
|
||||||
|
|
||||||
class PDFDownloader:
|
class PDFDownloader:
|
||||||
@ -21,42 +19,31 @@ class PDFDownloader:
|
|||||||
def start(self):
|
def start(self):
|
||||||
self.finish() # clear up
|
self.finish() # clear up
|
||||||
|
|
||||||
options = webdriver.FirefoxOptions()
|
options = webdriver.ChromeOptions()
|
||||||
options.profile = config["browser_profile_path"]
|
options.add_argument(f"user-data-dir={config['browser_profile_path']}")
|
||||||
# should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work
|
|
||||||
|
|
||||||
if os.getenv("DEBUG", "false") == "true":
|
|
||||||
self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
|
|
||||||
else:
|
|
||||||
options.add_argument('--headless')
|
options.add_argument('--headless')
|
||||||
|
|
||||||
options.set_preference('print.save_as_pdf.links.enabled', True)
|
# if os.getenv("DEBUG", "false") == "true":
|
||||||
# Just save if the filetype is pdf already
|
# self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
|
||||||
# TODO: this is not working right now
|
# else:
|
||||||
|
|
||||||
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
|
# options.set_preference('print.save_as_pdf.links.enabled', True)
|
||||||
options.set_preference("browser.download.folderList", 2)
|
# # Just save if the filetype is pdf already
|
||||||
# options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
|
# # TODO: this is not working right now
|
||||||
# options.set_preference("pdfjs.disabled", True)
|
|
||||||
options.set_preference("browser.download.dir", config["default_download_path"])
|
|
||||||
|
|
||||||
self.logger.info("Starting gecko driver")
|
# options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
|
||||||
# peviously, in a single docker image:
|
# options.set_preference("browser.download.folderList", 2)
|
||||||
# self.driver = webdriver.Firefox(
|
# # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
|
||||||
# options = options,
|
# # options.set_preference("pdfjs.disabled", True)
|
||||||
# service = webdriver.firefox.service.Service(
|
# options.set_preference("browser.download.dir", config["default_download_path"])
|
||||||
# log_path = f'{config["local_storage_path"]}/geckodriver.log'
|
|
||||||
# ))
|
self.logger.info("Starting chrome driver")
|
||||||
self.driver = webdriver.Remote(
|
self.driver = webdriver.Remote(
|
||||||
command_executor = 'http://geckodriver:4444',
|
command_executor = 'http://chrome:4444', # the host chrome points to the chrome container
|
||||||
options = options,
|
options = options,
|
||||||
# can't set log path...
|
# can't set log path...
|
||||||
)
|
)
|
||||||
|
|
||||||
residues = os.listdir(config["default_download_path"])
|
|
||||||
for res in residues:
|
|
||||||
os.remove(os.path.join(config["default_download_path"], res))
|
|
||||||
|
|
||||||
self.running = True
|
self.running = True
|
||||||
|
|
||||||
def autostart(self):
|
def autostart(self):
|
||||||
@ -65,7 +52,7 @@ class PDFDownloader:
|
|||||||
|
|
||||||
def finish(self):
|
def finish(self):
|
||||||
if self.running:
|
if self.running:
|
||||||
self.logger.info("Exiting gecko driver")
|
self.logger.info("Exiting chrome driver")
|
||||||
try:
|
try:
|
||||||
self.driver.quit()
|
self.driver.quit()
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
@ -73,7 +60,7 @@ class PDFDownloader:
|
|||||||
self.logger.critical("Connection to the driver broke off")
|
self.logger.critical("Connection to the driver broke off")
|
||||||
self.running = False
|
self.running = False
|
||||||
else:
|
else:
|
||||||
self.logger.info("Gecko driver not yet running")
|
self.logger.info("Chrome driver not yet running")
|
||||||
|
|
||||||
def download(self, article_object):
|
def download(self, article_object):
|
||||||
sleep_time = 2
|
sleep_time = 2
|
||||||
@ -153,8 +140,6 @@ class PDFDownloader:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def make_path_unique(path):
|
def make_path_unique(path):
|
||||||
fname, ending = os.path.splitext(path)
|
fname, ending = os.path.splitext(path)
|
||||||
fname += datetime.datetime.now().strftime("%d-%H%M%S")
|
fname += datetime.datetime.now().strftime("%d-%H%M%S")
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
from __future__ import unicode_literals
|
|
||||||
import youtube_dl
|
import youtube_dl
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
import time
|
|
||||||
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
|
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
|
||||||
import logging
|
import logging
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user