Better structure
This commit is contained in:
0
app/utils_worker/_init__.py
Normal file
0
app/utils_worker/_init__.py
Normal file
33
app/utils_worker/compress/runner.py
Normal file
33
app/utils_worker/compress/runner.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import os
|
||||
import subprocess
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import configuration
|
||||
config = configuration.parsed["DOWNLOADS"]
|
||||
|
||||
shrink_sizes = []
|
||||
|
||||
def shrink_pdf(article):
    """Compress the PDF belonging to ``article`` in place using Ghostscript.

    Ghostscript writes a reduced copy to ``compressed.pdf`` inside the
    configured default download path. When that succeeds the copy is moved
    over the original file and the number of bytes saved is appended to the
    module-level ``shrink_sizes`` list.

    Args:
        article: object exposing ``save_path`` and ``file_name``.

    Returns:
        The same ``article`` object. It is never modified here; every
        failure is only logged.
    """
    # os.path.join is correct whether or not save_path ends with a separator.
    pdf_path = os.path.join(article.save_path, article.file_name)
    tmp_path = os.path.join(config['default_download_path'], "compressed.pdf")

    try:
        initial_size = os.path.getsize(pdf_path)
    except OSError as e:
        logger.error(f"Could not run the compression! {e}")
        return article

    # The arguments are handed over as a list (no shell involved), so the
    # input path must NOT be wrapped in literal quotes: gs would look for a
    # file whose name contains the quote characters.
    c = subprocess.run(
        [
            "gs",
            "-sDEVICE=pdfwrite",
            "-dPDFSETTINGS=/screen",
            "-dNOPAUSE",
            "-dBATCH",
            f"-sOutputFile={tmp_path}",
            pdf_path,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if c.returncode != 0:
        logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
        return article

    # Capture the output so the error branch below has something to decode.
    m = subprocess.run(
        ["mv", "-f", tmp_path, pdf_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if m.returncode != 0:
        logger.error(f"Compression ran but I could not copy back the file {m.stderr.decode()} - {m.stdout.decode()}")
        return article

    final_size = os.path.getsize(pdf_path)
    shrink_sizes.append(initial_size - final_size)
    logger.info(f"Compression worked. Avg shrinkage: {sum(shrink_sizes)/len(shrink_sizes) / 1000} (kb)")
    return article  # even though no modifications were made
|
0
app/utils_worker/download/__init__.py
Normal file
0
app/utils_worker/download/__init__.py
Normal file
158
app/utils_worker/download/browser.py
Normal file
158
app/utils_worker/download/browser.py
Normal file
@@ -0,0 +1,158 @@
|
||||
import time
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import base64
|
||||
import requests
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
import configuration
|
||||
|
||||
config = configuration.parsed["DOWNLOADS"]
|
||||
|
||||
|
||||
|
||||
class PDFDownloader:
    """Saves a given url as a PDF and fills the article object it got as a parameter.

    A single Firefox (gecko) driver is started lazily and reused between
    downloads. After a failed page load the driver is shut down, and the
    next call to ``download`` starts a fresh one.
    """

    logger = logging.getLogger(__name__)
    # status-variable for restarting:
    running = False

    def start(self):
        """Start a headless Firefox driver and empty the default download directory."""
        options = Options()
        options.profile = config["browser_profile_path"]
        # TODO: Get headless mode interactively
        options.add_argument('--headless')
        # options.add_argument("--disable-infobars")
        # options.set_preference("javascript.enabled", False)
        # options.add_argument("--disable-popup-blocking")
        # Print to pdf
        options.set_preference("print_printer", "Mozilla Save to PDF")
        options.set_preference("print.always_print_silent", True)
        options.set_preference("print.show_print_progress", False)
        options.set_preference('print.save_as_pdf.links.enabled', True)
        options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
        # Save existing pdf
        options.set_preference("browser.download.folderList", 2)
        # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
        # options.set_preference("pdfjs.disabled", True)
        options.set_preference("browser.download.dir", config["default_download_path"])

        self.logger.info("Now Starting gecko driver")
        self.driver = webdriver.Firefox(options=options)

        # Remove whatever is still lying around in the download directory.
        residues = os.listdir(config["default_download_path"])
        for res in residues:
            os.remove(os.path.join(config["default_download_path"], res))

        self.running = True

    def autostart(self):
        """Start the driver unless it is already marked as running."""
        if not self.running:
            self.start()  # relaunch the dl util

    def finish(self):
        """Quit the driver and mark the downloader as stopped."""
        self.driver.quit()
        self.running = False

    def download(self, article_object):
        """Load the article's url and store it as a PDF below ``save_path``.

        Args:
            article_object: object exposing ``article_url``, ``is_title_bad``,
                ``title``, ``fname_template``, ``save_path``, ``file_name``
                and ``set_references``.

        Returns:
            The same ``article_object``. On success ``file_name`` holds the
            stored file name and the page's references are handed to
            ``set_references``; on failure ``file_name`` is ``""``. If the
            page could not be loaded at all the object is returned untouched.
        """
        sleep_time = 1
        self.autostart()
        url = article_object.article_url

        # arbitrary bug fixes:
        if "focus.de" in url or "bloomberg.com" in url:
            url = url.replace("https://", "https://outline.com/")
            sleep_time += 5
        try:
            self.driver.get(url)
        except Exception as e:
            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
            self.finish()
            return article_object  # without changes

        # leave the page time to do any funky business
        time.sleep(sleep_time)

        # in the mean time, get a page title if required
        if article_object.is_title_bad:
            article_object.title = self.driver.title.replace(".pdf", "")
            # will be propagated to dst as well

        fname = article_object.fname_template
        dst = os.path.join(article_object.save_path, fname)
        if os.path.exists(dst):
            fname = make_path_unique(fname)
            dst = os.path.join(article_object.save_path, fname)

        if url.endswith(".pdf"):
            # according to the browser preferences, calling the url will open pdfjs.
            # If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
            success = self.get_exisiting_pdf(url, dst)
        else:
            success = self.get_new_pdf(dst)

        if success:
            article_object.file_name = fname
            # Call the setter (as the fetch step does with set_authors /
            # set_keywords) instead of overwriting it with a list.
            article_object.set_references(self.get_references())
        else:
            article_object.file_name = ""

        return article_object  # this change is saved later manually

    def get_exisiting_pdf(self, url, dst):
        """Fetch an already existing PDF over HTTP and write it to ``dst``.

        Returns:
            ``True`` if the file was written, ``False`` otherwise.
        """
        try:
            r = requests.get(url)
            content = r.content
        except Exception as e:
            self.logger.error(f"Failed to fetch existing pdf: {e}")
            return False
        return self.get_new_pdf(dst, other_bytes=content)

    def get_new_pdf(self, dst, other_bytes=None):
        """Write a PDF to ``dst``.

        Args:
            dst: target file path; missing parent directories are created.
            other_bytes: ready-made file content. When ``None`` the page
                currently loaded in the driver is printed to PDF instead.

        Returns:
            ``True`` if the file was written, ``False`` otherwise.
        """
        if other_bytes is None:
            try:
                result = self.driver.print_page()
                content = base64.b64decode(result, validate=True)
            except Exception:
                self.logger.error("Failed, probably because the driver went extinct.")
                return False
        else:
            content = other_bytes

        try:
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            with open(dst, "wb") as f:
                f.write(content)
            return True
        except Exception as e:
            self.logger.error(f"Failed, because of FS-operation: {e}")
            return False

    def get_references(self):
        """Collect the link targets of the current page, minus blacklisted domains.

        Returns:
            A list of href strings that contain none of the entries of
            ``config["blacklisted_href_domains"]``. An empty list is returned
            when the links cannot be read from the driver.
        """
        try:
            # "xpath" is the locator strategy string; find_elements(by, value)
            # replaces the removed find_elements_by_xpath helper.
            links = self.driver.find_elements("xpath", "//a[@href]")
            hrefs = [e.get_attribute("href") for e in links]
        except Exception:
            hrefs = []

        blacklist = config["blacklisted_href_domains"]
        # filter a tiny bit at least: drop empty hrefs and blacklisted domains
        return [
            h for h in hrefs
            if h and not any(domain in h for domain in blacklist)
        ]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def make_path_unique(path):
    """Return ``path`` with a day/time stamp inserted in front of its extension.

    The stamp has the form ``DD-HHMMSS`` (current local time) and is appended
    directly to the part of ``path`` that precedes the extension.
    """
    stem, extension = os.path.splitext(path)
    stamp = datetime.datetime.now().strftime("%d-%H%M%S")
    return "".join((stem, stamp, extension))
|
0
app/utils_worker/download/runner.py
Normal file
0
app/utils_worker/download/runner.py
Normal file
33
app/utils_worker/download/youtube.py
Normal file
33
app/utils_worker/download/youtube.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import logging
|
||||
import os
|
||||
from pytube import YouTube
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def save_video(article_object):
    """Download the YouTube video behind ``article_object.article_url``.

    The last entry of the progressive streams ordered by resolution is
    downloaded. The outcome is written to ``article_object.file_name``:
    the file name template on success, otherwise a short error text.

    Returns:
        The same ``article_object``.
    """
    video_url = article_object.article_url
    logger.info("Saving new video")

    try:
        video = YouTube(video_url)
        candidates = video.streams.filter(progressive=True).order_by('resolution')
    except Exception as err:
        article_object.file_name = "ERROR: {}".format(err)
        return article_object

    # Guard clause: nothing to download.
    if not candidates:
        article_object.file_name = "No streams available"
        return article_object

    chosen = candidates[-1]
    # Source and title are set before the target path is built from the template.
    article_object.source_name = "youtube.com"
    article_object.title = video.title
    target = os.path.join(article_object.save_path, article_object.fname_template)

    try:
        # NOTE(review): the joined path is passed as the first positional
        # argument of download() - confirm this is the intended parameter.
        chosen.download(target)
        article_object.file_name = article_object.fname_template
    except Exception as err:
        logger.error(f"Youtube download crashed: {err}")
        article_object.file_name = "Error while downloading"

    return article_object
|
60
app/utils_worker/fetch/runner.py
Normal file
60
app/utils_worker/fetch/runner.py
Normal file
@@ -0,0 +1,60 @@
|
||||
from newspaper import Article
|
||||
from urllib.parse import urlparse
|
||||
from htmldate import find_date
|
||||
import datetime
|
||||
import logging
|
||||
logging.getLogger('newspaper').setLevel(logging.ERROR) # quieter logs
|
||||
logging.getLogger('urllib').setLevel(logging.ERROR) # quieter logs
|
||||
logging.getLogger('urllib3.poolmanager').setLevel(logging.ERROR) # quieter logs
|
||||
logging.getLogger('htmldate').setLevel(logging.ERROR) #quieter logs
|
||||
logging.getLogger('charset_normalizer').setLevel(logging.ERROR) #quieter logs
|
||||
logger = logging.getLogger("fetch")
|
||||
|
||||
|
||||
class NewspaperDummy:
    """Placeholder carrying the attributes ``get_description`` reads from an article.

    It is used as the fallback when the real article cannot be downloaded or
    parsed, and as the source of default title/summary texts.
    """

    # All textual fields share the same error message.
    title = summary = text = "Error while running fetch"
    # No authors / keywords are known for a failed fetch.
    authors = []
    keywords = []
|
||||
|
||||
|
||||
def get_description(article_object):
    """Fill in source, publication date, title, summary, authors and keywords.

    The source name is the network location of the article url. The
    publication date comes from ``find_date``; the remaining fields come from
    a downloaded and parsed ``Article``. Whenever a lookup fails, placeholder
    values are used instead (1900-01-01 for the date, ``NewspaperDummy``
    values for the texts).

    Args:
        article_object: object exposing ``article_url``, ``source_name``,
            ``pub_date``, ``title``, ``summary``, ``set_authors`` and
            ``set_keywords``.

    Returns:
        The same ``article_object`` with the fields above set.
    """
    url = article_object.article_url
    website = urlparse(url).netloc
    article_object.source_name = website

    try:
        # find_date yields a 'YYYY-MM-DD' string by default, so the format
        # must be year-month-day ('%M' would be minutes, not the month).
        pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d')
    except Exception:  # other file types, or no date found
        pub_date = datetime.datetime(year=1900, month=1, day=1)
    article_object.pub_date = pub_date

    fallback = NewspaperDummy()
    try:
        news_article = Article(url)
        news_article.download()
        news_article.parse()
    except Exception:
        news_article = fallback

    if news_article.title:
        title = news_article.title
    else:
        title = fallback.title

    if news_article.summary:
        summary = news_article.summary
    elif news_article.text:
        # No summary available: use the first 500 characters of the text.
        summary = news_article.text[:500] + "..."
    else:
        summary = fallback.summary

    article_object.title = title
    article_object.summary = summary
    article_object.set_authors(news_article.authors)
    article_object.set_keywords(news_article.keywords)

    return article_object
|
18
app/utils_worker/upload/runner.py
Normal file
18
app/utils_worker/upload/runner.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def upload_to_archive(article_object):
    """Save the article's url on archive.org and record the result.

    On success ``article_object.archive_url`` is set to the value returned by
    the save call; on any error it is set to an error text instead.

    Returns:
        The same ``article_object``.
    """
    target = article_object.article_url
    agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"  # needed?

    try:
        saved_url = WaybackMachineSaveAPI(target, agent).save()
        logger.info(f"{target} uploaded to archive successfully")
        article_object.archive_url = saved_url
    except Exception as err:
        article_object.archive_url = "Error while uploading: {}".format(err)
        logger.error(f"Error while generating new url: {err}")

    return article_object
|
43
app/utils_worker/worker_template.py
Normal file
43
app/utils_worker/worker_template.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from threading import Thread
|
||||
import time
|
||||
import logging
|
||||
# logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TemplateWorker(Thread):
    """Parent class for any subsequent worker of the article-download pipeline.

    They should all run in parallel, thus the Thread subclassing. Articles are
    handed over with ``process()`` and consumed one at a time by the thread's
    target, ``_queue_processor``.
    """

    logger = logging.getLogger(__name__)

    def __init__(self, *args, **kwargs) -> None:
        """Set up the thread and its (initially empty) article queue.

        Only the ``group`` and ``name`` keyword arguments are forwarded to
        ``Thread``; the target is always ``_queue_processor``.
        """
        # Local import so this class does not depend on the module's import block.
        import queue

        target = self._queue_processor  # will be executed on Worker.start()
        group = kwargs.get("group", None)
        name = kwargs.get("name", None)

        super().__init__(group=group, target=target, name=name)
        # A blocking queue lets the worker sleep until an article arrives
        # instead of polling a list every few seconds.
        self._article_queue = queue.Queue()
        self.logger.info(f"Worker thread {self.__class__.__name__} initialized successfully")

    def process(self, article_watcher):
        """Enqueue ``article_watcher`` for processing by this worker's thread."""
        self._article_queue.put(article_watcher)

    def _queue_processor(self):
        """This method is launched by thread.run() and blocks while the queue is empty.

        When an external caller adds to the queue it jumps into action. An
        exception raised while handling one article is logged and the loop
        continues, so a single bad article cannot stop the worker.
        """
        while True:
            article_watcher = self._article_queue.get()  # blocks until an item is available
            self.logger.info(f"{self.__class__.__name__} is now processing article ({self._article_queue.qsize()} in queue)")
            try:
                self._handle_article(article_watcher)
            except Exception:
                self.logger.exception(f"{self.__class__.__name__} failed while processing an article")

    def _handle_article(self, article_watcher, action=None):
        """Apply ``action`` to the watcher's article and save the result.

        Child classes override this method and call it with their own action.

        Args:
            article_watcher: object exposing the article as ``.article``.
            action: callable taking the article and returning it; when
                ``None`` only an error is logged.
        """
        if action is None:
            self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
        else:
            article = article_watcher.article
            article = action(article)  # action updates the article object but does not save the change
            article.save()
|
60
app/utils_worker/workers.py
Normal file
60
app/utils_worker/workers.py
Normal file
@@ -0,0 +1,60 @@
|
||||
from .worker_template import TemplateWorker
|
||||
from .download.browser import PDFDownloader
|
||||
from .download.youtube import save_video
|
||||
from .fetch.runner import get_description
|
||||
from .upload.runner import upload_to_archive as run_upload
|
||||
from .compress.runner import shrink_pdf
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DownloadWorker(TemplateWorker):
    """Worker that stores an article locally: videos for YouTube links, PDFs otherwise."""

    def __init__(self) -> None:
        # Both runners take an article and return it.
        self.yt_runner = save_video
        self.dl_runner = PDFDownloader().download
        super().__init__()

    def _handle_article(self, article_watcher):
        """Pick the runner matching the article url, run it, and flag the download as done."""
        target_url = article_watcher.article.article_url
        is_youtube = any(marker in target_url for marker in ("youtu.be/", "youtube.com/"))
        runner = self.yt_runner if is_youtube else self.dl_runner

        super()._handle_article(article_watcher, runner)
        article_watcher.download_completed = True
|
||||
|
||||
|
||||
|
||||
class FetchWorker(TemplateWorker):
    """Worker that runs ``get_description`` on each queued article."""

    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        """Run the fetch step and flag it as completed on the watcher."""
        super()._handle_article(article_watcher, get_description)
        article_watcher.fetch_completed = True
|
||||
|
||||
|
||||
|
||||
class UploadWorker(TemplateWorker):
    """Worker that runs the archive upload (``run_upload``) on each queued article."""

    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        """Run the upload step and flag it as completed on the watcher."""
        super()._handle_article(article_watcher, run_upload)
        article_watcher.upload_completed = True
|
||||
|
||||
|
||||
|
||||
class CompressWorker(TemplateWorker):
    """Worker that runs ``shrink_pdf`` on each queued article."""

    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        """Run the compression step and flag it as completed on the watcher."""
        super()._handle_article(article_watcher, shrink_pdf)
        article_watcher.compression_completed = True
|
Reference in New Issue
Block a user