Working and up to date. WIP misc manual actions
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import configuration
|
||||
@@ -8,26 +10,35 @@ config = configuration.parsed["DOWNLOADS"]
|
||||
shrink_sizes = []
|
||||
|
||||
def shrink_pdf(article):
|
||||
initial_size = os.path.getsize(article.save_path + article.file_name)
|
||||
if article.file_name[-4:] != ".pdf":
|
||||
article_loc = Path(article.save_path) / article.file_name
|
||||
initial_size = article_loc.stat().st_size
|
||||
compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"
|
||||
|
||||
if article_loc.suffix != ".pdf":  # Path.suffix includes the leading dot
|
||||
return article # it probably was a youtube video
|
||||
|
||||
c = subprocess.run(
|
||||
["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH", f"-sOutputFile={config['default_download_path']}/compressed.pdf", f"{article.save_path + article.file_name}"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE
|
||||
[
|
||||
"gs",
|
||||
"-sDEVICE=pdfwrite",
|
||||
"-dPDFSETTINGS=/screen",
|
||||
"-dNOPAUSE",
|
||||
"-dBATCH",
|
||||
f"-sOutputFile={compressed_tmp}",
|
||||
f"{article_loc}"
|
||||
],
|
||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
|
||||
if c.returncode == 0:
|
||||
m = subprocess.run(
|
||||
["mv", "-f", f"{config['default_download_path']}/compressed.pdf", article.save_path + article.file_name]
|
||||
)
|
||||
if m.returncode == 0:
|
||||
final_size = os.path.getsize(article.save_path + article.file_name)
|
||||
shrink_sizes.append(initial_size - final_size)
|
||||
logger.info(f"Compression worked. Avg shrinkage: {sum(shrink_sizes)/len(shrink_sizes) / 1000} (kb)")
|
||||
return article # even though no modifications were made
|
||||
else:
|
||||
logger.error(f"Compression ran but I could not copy back the file {m.stderr.decode()} - {m.stdout.decode()}")
|
||||
try:
|
||||
os.replace(compressed_tmp, article_loc)
|
||||
except OSError as e:
|
||||
logger.error(f"Compression ran but I could not copy back the file {e}")
|
||||
|
||||
final_size = article_loc.stat().st_size
|
||||
shrink_sizes.append(initial_size - final_size)
|
||||
logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
|
||||
|
||||
|
||||
else:
|
||||
|
@@ -2,6 +2,7 @@ import time
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import base64
|
||||
import requests
|
||||
from selenium import webdriver
|
||||
@@ -21,18 +22,19 @@ class PDFDownloader:
|
||||
def start(self):
|
||||
options=Options()
|
||||
options.profile = config["browser_profile_path"]
|
||||
# TODO: Get headless mode interactively
|
||||
options.add_argument('--headless')
|
||||
# options.add_argument("--disable-infobars")
|
||||
# options.set_preference("javascript.enabled", False)
|
||||
# options.add_argument("--disable-popup-blocking")
|
||||
if "notheadless" in sys.argv:
|
||||
self.logger.warning("Opening browser GUI because of Argument 'notheadless'")
|
||||
else:
|
||||
options.add_argument('--headless')
|
||||
|
||||
# Print to pdf
|
||||
options.set_preference("print_printer", "Mozilla Save to PDF")
|
||||
options.set_preference("print.always_print_silent", True)
|
||||
options.set_preference("print.show_print_progress", False)
|
||||
options.set_preference('print.save_as_pdf.links.enabled', True)
|
||||
|
||||
# Just save if the filetype is pdf already, does not work!
|
||||
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
|
||||
# Save existing pdf
|
||||
options.set_preference("browser.download.folderList", 2)
|
||||
# options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
|
||||
# options.set_preference("pdfjs.disabled", True)
|
||||
@@ -140,7 +142,7 @@ class PDFDownloader:
|
||||
hrefs = [h for h in hrefs \
|
||||
if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (esp. not 0)
|
||||
] # filter a tiny bit at least
|
||||
self.logger.info(f"Hrefs result (before:{len_old}, after: {len(hrefs)})")
|
||||
self.logger.info(f"Hrefs filtered (before: {len_old}, after: {len(hrefs)})")
|
||||
return hrefs
|
||||
|
||||
|
||||
|
@@ -11,61 +11,52 @@ logging.getLogger('charset_normalizer').setLevel(logging.ERROR) #quieter logs
|
||||
logger = logging.getLogger("fetch")
|
||||
|
||||
|
||||
class NewspaperDummy():
|
||||
title = "Error while running fetch"
|
||||
summary = "Error while running fetch"
|
||||
text = "Error while running fetch"
|
||||
meta_lang = ""
|
||||
authors = []
|
||||
keywords = []
|
||||
|
||||
|
||||
def get_description(article_object):
|
||||
url = article_object.article_url
|
||||
website = urlparse(url).netloc
|
||||
article_object.source_name = website
|
||||
|
||||
try:
|
||||
pub_date = datetime.datetime.strptime(find_date(url), '%Y-%d-%M')
|
||||
article_object.pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d')  # find_date yields YYYY-MM-DD
|
||||
except: # other file types
|
||||
pub_date = datetime.datetime(year=1900, month=1, day=1)
|
||||
article_object.pub_date = pub_date
|
||||
article_object.pub_date = datetime.datetime(year=1900, month=1, day=1)
|
||||
|
||||
fallback = NewspaperDummy()
|
||||
try:
|
||||
news_article = Article(url)
|
||||
news_article.download()
|
||||
news_article.parse()
|
||||
except:
|
||||
news_article = fallback
|
||||
|
||||
if news_article.title:
|
||||
title = news_article.title
|
||||
else:
|
||||
title = fallback.title
|
||||
|
||||
if news_article.summary:
|
||||
summary = news_article.summary
|
||||
elif news_article.text:
|
||||
ind = min(500, len(news_article.text))
|
||||
summary = news_article.text[:ind] + "..."
|
||||
else:
|
||||
summary = fallback.summary
|
||||
news_article = object() # fallback value
|
||||
|
||||
try:
|
||||
print(f"lang: {news_article.meta_lang}")
|
||||
except:
|
||||
print("could not access meta_lang")
|
||||
|
||||
if news_article.meta_lang:
|
||||
lang = news_article.meta_lang
|
||||
else:
|
||||
lang = ""
|
||||
article_object.title = news_article.title
|
||||
except AttributeError:
|
||||
article_object.title = "Error while running fetch"
|
||||
|
||||
article_object.title = title
|
||||
article_object.summary = summary
|
||||
article_object.language = lang
|
||||
article_object.set_authors(news_article.authors)
|
||||
article_object.set_keywords(news_article.keywords)
|
||||
try:
|
||||
if news_article.summary:
|
||||
article_object.summary = news_article.summary
|
||||
elif news_article.text:
|
||||
ind = min(500, len(news_article.text))
|
||||
article_object.summary = news_article.text[:ind] + "..."
|
||||
else:
|
||||
article_object.summary = ""
|
||||
except AttributeError:
|
||||
article_object.summary = ""
|
||||
|
||||
try:
|
||||
article_object.language = news_article.meta_lang
|
||||
except AttributeError:
|
||||
article_object.language = ""
|
||||
|
||||
try:
|
||||
article_object.set_authors(news_article.authors)
|
||||
except AttributeError:
|
||||
pass # list would have been empty anyway
|
||||
|
||||
try:
|
||||
article_object.set_keywords(news_article.keywords)
|
||||
except AttributeError:
|
||||
pass # list would have been empty anyway
|
||||
|
||||
return article_object
|
||||
|
@@ -28,7 +28,7 @@ class TemplateWorker(Thread):
|
||||
time.sleep(5)
|
||||
else:
|
||||
article_watcher = self._article_queue.pop(0)
|
||||
self.logger.info(f"{self.__class__.__name__} is now processing article ({len(self._article_queue)} in queue)")
|
||||
self.logger.info(f"{self.__class__.__name__} now processing from queue (length: {len(self._article_queue)}) - {article_watcher.article}")
|
||||
self._handle_article(article_watcher)
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user