Working and up to date. WIP misc manual actions

This commit is contained in:
Remy Moll 2022-05-24 18:37:30 +02:00
parent 246729d376
commit 878a1dff5d
14 changed files with 142 additions and 182 deletions

View File

@ -1,5 +1,6 @@
FROM python:latest
# Container timezone; must be a valid tz database name.
ENV TZ=Europe/Zurich
RUN echo "deb http://deb.debian.org/debian/ unstable main contrib non-free" >> /etc/apt/sources.list
RUN apt-get update && apt-get install -y \
evince \
@ -9,6 +10,7 @@ xauth wget tar firefox \
ghostscript
# for compression
# Download gecko (firefox) driver for selenium
RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.31.0/geckodriver-v0.31.0-linux64.tar.gz
RUN tar -x geckodriver -zf geckodriver-v0.31.0-linux64.tar.gz -O > /usr/bin/geckodriver
@ -16,10 +18,19 @@ RUN chmod +x /usr/bin/geckodriver
RUN rm geckodriver-v0.31.0-linux64.tar.gz
RUN echo "127.0.0.1 localhost" >> /etc/hosts
RUN useradd --create-home --shell /bin/bash --uid 1001 autonews
# id mapped to local user
# home directory needed for pip package installation
RUN mkdir -p /app/auto_news
RUN chown -R autonews:autonews /app
USER autonews
COPY requirements.txt /app/
RUN python3 -m pip install -r /app/requirements.txt
RUN mkdir -p /app/auto_news
COPY app /app/auto_news
WORKDIR /app/auto_news

View File

@ -1,9 +0,0 @@
import configuration
from utils_mail import runner
class Dummy:
source_name = "AS"
title = "dummy title"
mail_info = [{"reply_text": "UNFOOO", "file_path":None}]
runner.send(Dummy())

View File

@ -157,8 +157,14 @@ class Coordinator(Thread):
ArticleWatcher(article, workers_manual = workers, notifier = notifier)
def article_complete_notifier(self, article, thread):
self.worker_slack.bot_worker.respond_channel_message(thread)
self.worker_mail.send(article)
if self.worker_slack is None:
logger.warning("Not sending slack notifier")
else:
self.worker_slack.bot_worker.respond_channel_message(thread)
if self.worker_mail is None:
logger.warning("Not sending mail notifier")
else:
self.worker_mail.send(article)

View File

@ -56,6 +56,7 @@ def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
def send_reaction_to_slack_thread(article, reaction):
"""Sends the verification status as a reaction to the associated slack thread. This will significantly decrease load times of the bot"""
thread = article.slack_thread
messages = models.Message.select().where(models.Message.text.contains(article.article_url))
# TODO rewrite this shit
if len(messages) > 5:
@ -74,6 +75,7 @@ def send_reaction_to_slack_thread(article, reaction):
)
print("Sent reaction to message")
def prompt_language(query):
not_set = True
while not_set:
@ -132,27 +134,6 @@ def accept_article(article, last_accepted):
# also update the threads to not be monitored anymore
send_reaction_to_slack_thread(article, "white_check_mark")
"""linked = None
try:
thread = message_models.Thread.get(id = last_accepted.id + 1)
rel = message_models.get_referenced_articles(thread, article_models.ArticleDownload)
assert len(rel) == 1 and rel[0] == article
linked = thread
except: # if the above, naive method (just increment by one), fails, resort to brute search.
print("Bruteforcing search")
for t in message_models.Thread.select():
rel = message_models.get_referenced_articles(t, article_models.ArticleDownload)
if len(rel) == 1 and rel[0] == article:
linked = t
break
if linked:
linked.initiator_message.is_processed_override = 1
linked.initiator_message.save()
print("Message overwritten to PROCESSED")
else:
print("No matching thread found")"""
return "" # linked
@ -176,10 +157,13 @@ def verify_unchecked():
core_info.append(entry)
try:
# close any previously opened windows:
# Pass an argument list: without shell=True a single string is treated as the
# executable name ("killall evince"), which raises FileNotFoundError.
subprocess.call(["killall", "evince"])
# then open a new one
subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# suppress evince GTK warnings
except Exception as e:
print(str(list((-1, e))))
print(e)
continue
@ -220,66 +204,3 @@ def verify_unchecked():
proceed = True
else:
print("Invalid input")
# def verify_bad():
# b_options = {
# "ENTER":"Accept pdf as fixed",
# "B": "Keep pdf in BAD.",
# "R" : "set related files (prompted multiple times)",
# "C" : "Change the saved file-name and set as verified."
# }
# query = article_models.ArticleDownload.select().where(article_models.ArticleDownload.verified == -1).execute()
# for q in query:
# pdf = q.file_name
# save_dir = get_save_path(q)
# fname = save_dir + "BAD/" + pdf
# try:
# subprocess.call(["xdg-open", fname])
# except:
# print(f"[{testvar}██{testvar}] PDF moved:")
# print(fname)
# continue
# status_pdf = f"{testvar}██{testvar}"
# if "just a moment" in pdf:
# status_pdf = f"{testvar}██{testvar}"
# language = q.language
# status_language = f"{testvar}██{testvar}"
# if len(language) == 0:
# status_language = f"{testvar}██{testvar}"
# print_status_options(
# status=u_status.format(
# url = q.article_url,
# status_pdf = status_pdf,
# pdf = pdf[:80],
# status_language = status_language,
# language = language
# ),
# options = b_options)
# proceed = False
# while not proceed:
# proceed = False
# uin = input("Choice? ").lower()
# if uin == "":
# unreject_article(q)
# proceed = True
# elif uin == "b":
# proceed = True
# elif uin == "r":
# prompt_related(q)
# elif uin == "c":
# prompt_new_fname(q)
# proceed = True
# else:
# print("Invalid input")

View File

@ -210,7 +210,7 @@ def message_dict_to_model(message):
channel_id = config["archive_id"],
text = message["text"]
)
logger.info("Saved (text) {} (new={})".format(m, new))
logger.info(f"Saved: {m} ({'new' if new else 'old'})")
files = message.get("files", [])
if len(files) >= 1:
@ -218,7 +218,7 @@ def message_dict_to_model(message):
m.file_type = f["filetype"]
m.perma_link = f["url_private_download"]
m.save()
logger.info("Saved permalink {} to {}".format(f["name"], m))
logger.info(f"Saved {m.file_type}-file for message (id={m.id})")
if new:
return m
else:

View File

@ -78,7 +78,7 @@ class BotApp(App):
def incoming_channel_message(self, message):
self.logger.info("Handling message with {} url(s)".format(len(message.urls)))
self.logger.info(f"Handling message {message} ({len(message.urls)} urls)")
if not message.urls: # no urls in a root-message => IGNORE
message.is_processed_override = True

View File

@ -45,7 +45,7 @@ class ArticleDownload(DownloadBaseModel):
# ... are added through foreignkeys
def __str__(self) -> str:
return "ART ({} -- {})".format(self.title, self.source_name)
return f"ART [{self.title} -- {self.source_name}]"
## Useful Properties
@property
@ -248,7 +248,7 @@ class Message(ChatBaseModel):
# reaction
def __str__(self) -> str:
return "MSG ({} -- {})".format(self.channel_id, self.text[:min(len(self.text), 50)].replace("\n","/") + "....")
return "MSG [{}]".format(self.text[:min(len(self.text), 30)].replace('\n','/') + '...')
@property
def slack_ts(self):
@ -312,18 +312,4 @@ def clear_path_name(path):
keepcharacters = (' ','.','_', '-')
converted = "".join([c if (c.isalnum() or c in keepcharacters) else "_" for c in path]).rstrip()
return converted
# return re.sub(r'[^\x00-\x7f]', r'_', path)
# # cleared = path.replace("\n"," ")\
# # .replace("|", "_")\
# # .replace(":", "_")\
# # .replace("?", "_")\
# # .replace("!", "_")\
# # .replace(",", "_")\
# # .replace("/", "_")\
# # .replace("\\", "_")\
# # .replace("*", "")\
# # .replace("\"", "'")\
# # .replace("<", "'")\
# # .replace(">", "'")
# # return cleared

View File

@ -1,5 +1,7 @@
import os
import subprocess
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
import configuration
@ -8,26 +10,35 @@ config = configuration.parsed["DOWNLOADS"]
shrink_sizes = []
def shrink_pdf(article):
initial_size = os.path.getsize(article.save_path + article.file_name)
if article.file_name[-4:] != ".pdf":
article_loc = Path(article.save_path) / article.file_name
initial_size = article_loc.stat().st_size
compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"
if article_loc.suffix != "pdf":
return article # it probably was a youtube video
c = subprocess.run(
["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH", f"-sOutputFile={config['default_download_path']}/compressed.pdf", f"{article.save_path + article.file_name}"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
[
"gs",
"-sDEVICE=pdfwrite",
"-dPDFSETTINGS=/screen",
"-dNOPAUSE",
"-dBATCH",
f"-sOutputFile={compressed_tmp}",
f"{article_loc}"
],
stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
if c.returncode == 0:
m = subprocess.run(
["mv", "-f", f"{config['default_download_path']}/compressed.pdf", article.save_path + article.file_name]
)
if m.returncode == 0:
final_size = os.path.getsize(article.save_path + article.file_name)
shrink_sizes.append(initial_size - final_size)
logger.info(f"Compression worked. Avg shrinkage: {sum(shrink_sizes)/len(shrink_sizes) / 1000} (kb)")
return article # even though no modifications were made
else:
logger.error(f"Compression ran but I could not copy back the file {m.stderr.decode()} - {m.stdout.decode()}")
try:
os.replace(compressed_tmp, article_loc)
except OSError as e:
logger.error(f"Compression ran but I could not copy back the file {e}")
final_size = article_loc.stat().st_size
shrink_sizes.append(initial_size - final_size)
logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
else:

View File

@ -2,6 +2,7 @@ import time
import datetime
import logging
import os
import sys
import base64
import requests
from selenium import webdriver
@ -21,18 +22,19 @@ class PDFDownloader:
def start(self):
options=Options()
options.profile = config["browser_profile_path"]
# TODO: Get headless mode interactively
options.add_argument('--headless')
# options.add_argument("--disable-infobars")
# options.set_preference("javascript.enabled", False)
# options.add_argument("--disable-popup-blocking")
if "notheadless" in sys.argv:
self.logger.warning("Opening browser GUI because of Argument 'notheadless'")
else:
options.add_argument('--headless')
# Print to pdf
options.set_preference("print_printer", "Mozilla Save to PDF")
options.set_preference("print.always_print_silent", True)
options.set_preference("print.show_print_progress", False)
options.set_preference('print.save_as_pdf.links.enabled', True)
# Just save if the filetype is pdf already, does not work!
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
# Save existing pdf
options.set_preference("browser.download.folderList", 2)
# options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# options.set_preference("pdfjs.disabled", True)
@ -140,7 +142,7 @@ class PDFDownloader:
hrefs = [h for h in hrefs \
if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (esp. not 0)
] # filter a tiny bit at least
self.logger.info(f"Hrefs result (before:{len_old}, after: {len(hrefs)})")
self.logger.info(f"Hrefs filtered (before: {len_old}, after: {len(hrefs)})")
return hrefs

View File

@ -11,61 +11,52 @@ logging.getLogger('charset_normalizer').setLevel(logging.ERROR) #quieter logs
logger = logging.getLogger("fetch")
class NewspaperDummy():
title = "Error while running fetch"
summary = "Error while running fetch"
text = "Error while running fetch"
meta_lang = ""
authors = []
keywords = []
def get_description(article_object):
url = article_object.article_url
website = urlparse(url).netloc
article_object.source_name = website
try:
pub_date = datetime.datetime.strptime(find_date(url), '%Y-%d-%M')
# Parse as year-month-day; the previous '%Y-%d-%M' read the month as day and the day as minutes.
# NOTE(review): assumes find_date returns its default 'YYYY-MM-DD' string — confirm.
article_object.pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d')
except: # other file types
pub_date = datetime.datetime(year=1900, month=1, day=1)
article_object.pub_date = pub_date
article_object.pub_date = datetime.datetime(year=1900, month=1, day=1)
fallback = NewspaperDummy()
try:
news_article = Article(url)
news_article.download()
news_article.parse()
except:
news_article = fallback
if news_article.title:
title = news_article.title
else:
title = fallback.title
if news_article.summary:
summary = news_article.summary
elif news_article.text:
ind = min(500, len(news_article.text))
summary = news_article.text[:ind] + "..."
else:
summary = fallback.summary
news_article = object() # fallback value
try:
print(f"lang: {news_article.meta_lang}")
except:
print("could not access meta_lang")
article_object.title = news_article.title
except AttributeError:
article_object.title = "Error while running fetch"
if news_article.meta_lang:
lang = news_article.meta_lang
else:
lang = ""
try:
if article_object.summary:
article_object.summary = news_article.summary
elif news_article.text:
ind = min(500, len(news_article.text))
article_object.summary = news_article.text[:ind] + "..."
else:
article_object.summary = ""
except AttributeError:
article_object.summary = ""
article_object.title = title
article_object.summary = summary
article_object.language = lang
article_object.set_authors(news_article.authors)
article_object.set_keywords(news_article.keywords)
try:
article_object.language = news_article.meta_lang
except AttributeError:
article_object.language = ""
try:
article_object.set_authors(news_article.authors)
except AttributeError:
pass # list would have been empty anyway
try:
article_object.set_keywords(news_article.keywords)
except AttributeError:
pass # list would have been empty anyway
return article_object

View File

@ -28,7 +28,7 @@ class TemplateWorker(Thread):
time.sleep(5)
else:
article_watcher = self._article_queue.pop(0)
self.logger.info(f"{self.__class__.__name__} is now processing article ({len(self._article_queue)} in queue)")
self.logger.info(f"{self.__class__.__name__} now processing from queue (length: {len(self._article_queue)}) - {article_watcher.article}")
self._handle_article(article_watcher)

View File

@ -2,7 +2,7 @@ import os
import re
import json
os.chdir("/home/remy/Documents/mails2/")
os.chdir("/home/remy/Downloads/mails/")
# Raw string so that \s reaches the regex engine without an invalid-escape warning.
regex = r"(?P<url>https?://[^\s]+)"

View File

@ -0,0 +1,40 @@
from cmath import log
from concurrent.futures import thread
import sys
sys.path.append("../app")
import runner
import logging
logger = logging.getLogger()
import json
# Single-use media fetch: redirect the runner's databases and storage path,
# start a coordinator with the four workers, then hand it one message per URL
# read from media_urls.json.
logger.info("Overwriting production values for single use media-fetch")

chat_db = runner.configuration.SqliteDatabase("media_message_dummy.db")  # chat_db (not needed here)
download_db = runner.configuration.SqliteDatabase("media_downloads.db")
runner.configuration.models.set_db(chat_db, download_db)
runner.configuration.parsed["DOWNLOADS"]["local_storage_path"] = "."

coordinator = runner.Coordinator()
coordinator.add_workers(
    worker_download=runner.DownloadWorker(),
    worker_fetch=runner.FetchWorker(),
    worker_upload=runner.UploadWorker(),
    worker_compress=runner.CompressWorker(),
)
coordinator.start()

with open("media_urls.json", "r") as url_file:
    url_list = json.load(url_file)

logger.info(f"Found {len(url_list)} media urls")

for u in url_list:
    # Each URL is wrapped in the "<url|preview>" text form and attached to a
    # fresh Thread before being passed to the coordinator.
    request = runner.models.Message(
        text=f"<{u}|dummy preview text>",
        thread=runner.models.Thread(),
    )
    coordinator.incoming_request(request)

1
misc/media_urls.json Normal file
View File

@ -0,0 +1 @@
["https://www.nesta.org.uk/report/digital-democracy-the-tools-transforming-political-engagement/", "https://media.nesta.org.uk/documents/digital_democracy.pdf", "https://context-cdn.washingtonpost.com/notes/prod/default/documents/c3c41863-be9e-4246-9ed9-e43aedd013f9/note/4e677597-f403-4c9b-b838-f5613d79b341", "https://context-cdn.washingtonpost.com/notes/prod/default/documents/6d274110-a84b-4694-96cd-6a902207d2bd/note/733364cf-0afb-412d-a5b4-ab797a8ba154.#page=1", "https://www.judiciary.senate.gov/fisa-investigation", "https://www.state.gov/fact-sheet-activity-at-the-wuhan-institute-of-virology/", "https://www.whitehouse.gov/trump-administration-accomplishments/", "https://www.whitehouse.gov/wp-content/uploads/2021/01/IPS-Final-Declass.pdf", "https://www.finance.senate.gov/imo/media/doc/Oversight,%2012-23-20,%20Memo%20on%20World%20Vision%20Investigation.pdf", "https://justthenews.com/sites/default/files/2020-12/BidenArcher4-13-14.pdf", "https://www.hsgac.senate.gov/imo/media/doc/Johnson-Grassley%20Submission%202020-12-09.pdf", "https://navarroreport.com/", "https://got-freedom.org/wp-content/uploads/2020/12/HAVA-and-Non-Profit-Organization-Report-FINAL-W-Attachments-and-Preface-121420.pdf", "https://www.depernolaw.com/uploads/2/7/0/2/27029178/antrim_michigan_forensics_report_%5B121320%5D_v2_%5Bredacted%5D.pdf", "https://www.hsgac.senate.gov/imo/media/doc/HSGAC_Finance_Report_FINAL.pdf", "https://www.scribd.com/document/487040771/Emails-About-FBI-Receipt-Fusion-GPS-Thumb-Drive", "https://cdn.epoch.cloud/assets/static_assets/Voter-Fraud-Allegations-Infographic-Epoch-Times.jpg", "https://www.hsgac.senate.gov/imo/media/doc/Lync%20and%20text%20messages%20between%20and%20among%20DOJ%20and%20FBI%20employees.pdf", "https://www.hsgac.senate.gov/imo/media/doc/DOJ%20Docs%20Combined.pdf", "https://www.hsgac.senate.gov/imo/media/doc/FBI%20Productions%20Combined%20-%20updated_FINAL.pdf", "https://www.hsgac.senate.gov/imo/media/doc/STATE_combined.pdf", 
"https://cdn.epoch.cloud/assets/static_assets/epochtimes-infographic-war-on-president-trump.jpg", "https://centipedenation.com/transmissions/miles-guo-dropping-bombs-hunter-biden-sex-tapes-and-other-evidence-of-the-ccps-infiltration-of-the-u-s/", "https://www.finance.senate.gov/imo/media/doc/2020-11-18%20HSGAC%20-%20Finance%20Joint%20Report%20Supplemental.pdf", "https://www.scribd.com/document/479781400/Steele-Spreadsheet-1", "https://www.zerohedge.com/political/jim-comey-ignored-state-department-whistleblower-hillarys-crimes-classified-material", "https://www.judicialwatch.org/wp-content/uploads/2020/10/JW-v-State-Steele-Oct-2020-prod-00968.pdf", "https://justthenews.com/sites/default/files/2020-10/requested%20email.pdf", "https://www.tagblatt.ch/kultur/sommertipps-20-buchempfehlungen-fuer-prominente-wir-haben-die-besten-buecher-fuer-jeden-charakter-zusammengetragen-ld.2159339", "https://www.tagblatt.ch/kultur/sommertipps-20-buchempfehlungen-fuer-prominente-wir-haben-die-besten-buecher-fuer-jeden-charakter-zusammengetragen-ld.2159339", "https://greennetproject.org/en/2018/11/27/prof-dirk-helbing-es-braucht-vor-allem-tolle-ideen-in-die-sich-die-leute-verlieben/", "https://www.nature.com/articles/news.2010.351", "https://www.focus.de/panorama/welt/tid-19265/gastkommentar-nutzt-die-moeglichkeiten-des-computers_aid_534372.html", "http://www.ccss.ethz.ch/Response/index.html", "https://www.hpcwire.com/2011/05/06/simulating_society_at_the_global_scale/", "https://www.say.media/article/la-mort-par-algorithme", "https://www.say.media/article/la-mort-par-algorithme", "https://www.nzz.ch/panorama/wie-kann-eine-massenpanik-verhindert-werden-ld.1614761", 
"https://www.theglobalist.com/democracy-technology-innovation-society-internet/","https://www.theglobalist.com/capitalism-democracy-technology-surveillance-privacy/","https://www.theglobalist.com/google-artificial-intelligence-big-data-technology-future/","https://www.theglobalist.com/fascism-big-data-artificial-intelligence-surveillance-democracy/","https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/","https://www.theglobalist.com/technology-society-sustainability-future-humanity/","https://www.theglobalist.com/society-technology-peace-sustainability/","https://www.theglobalist.com/democracy-technology-social-media-artificial-intelligence/","https://www.theglobalist.com/financial-system-reform-economy-internet-of-things-capitalism/","https://www.theglobalist.com/capitalism-society-equality-sustainability-crowd-funding/","https://www.theglobalist.com/united-nations-world-government-peace-sustainability-society/","https://www.theglobalist.com/world-economy-sustainability-environment-society/"]