2022-04-17 22:32:56 +02:00

61 lines
1.8 KiB
Python

from newspaper import Article
from urllib.parse import urlparse
from htmldate import find_date
import datetime
import logging
# Raise the level of the chattier loggers to ERROR so only real problems show up.
for _noisy_logger in (
    'newspaper',
    'urllib',
    'urllib3.poolmanager',
    'htmldate',
    'charset_normalizer',
):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)  # quieter logs
del _noisy_logger  # keep the loop variable out of the module namespace

# Logger used for this module's own messages.
logger = logging.getLogger("fetch")
class NewspaperDummy():
    """Placeholder used in place of a fetched article when fetching fails.

    It exposes the attributes that ``get_description`` reads from an article
    (``title``, ``summary``, ``text``, ``authors``, ``keywords``), filled with
    an error message or empty lists.
    """

    title = "Error while running fetch"
    summary = "Error while running fetch"
    text = "Error while running fetch"
    # Class-level defaults are kept so that reading them off the class itself
    # keeps working; instances get their own lists in __init__.
    authors = []
    keywords = []

    def __init__(self):
        # Fresh lists per instance: a class-level list would be shared by every
        # dummy, so one caller mutating it would leak into all later fallbacks.
        self.authors = []
        self.keywords = []
def _lookup_pub_date(url):
    """Return the publication date htmldate finds for *url*, or 1900-01-01 if none."""
    try:
        found = find_date(url)
        if found:
            # htmldate returns the date as a '%Y-%m-%d' string by default
            # (year-month-day); '%M' would be minutes, not the month.
            return datetime.datetime.strptime(found, '%Y-%m-%d')
    except Exception:  # e.g. other file types that htmldate cannot handle
        logger.debug("Could not determine publication date for %s", url, exc_info=True)
    # Sentinel date meaning "publication date unknown".
    return datetime.datetime(year=1900, month=1, day=1)


def get_description(article_object):
    """Populate *article_object* with metadata fetched from its ``article_url``.

    Sets ``source_name`` (the URL's network location), ``pub_date``, ``title``
    and ``summary`` on the object and passes authors and keywords to its
    ``set_authors`` / ``set_keywords`` methods. If the article cannot be
    downloaded or parsed, the values of a ``NewspaperDummy`` are used instead;
    if no date is found, ``pub_date`` is 1900-01-01.

    Args:
        article_object: object with an ``article_url`` attribute and
            ``set_authors`` / ``set_keywords`` methods.

    Returns:
        The same *article_object*, modified in place.
    """
    url = article_object.article_url
    article_object.source_name = urlparse(url).netloc
    article_object.pub_date = _lookup_pub_date(url)

    fallback = NewspaperDummy()
    try:
        news_article = Article(url)
        news_article.download()
        news_article.parse()
    except Exception as err:
        # Best effort: a failed fetch must not abort the caller, but it should
        # not disappear silently either.
        logger.warning("Could not fetch article %s: %s", url, err)
        news_article = fallback

    article_object.title = news_article.title or fallback.title

    # NOTE(review): newspaper appears to fill `summary` and `keywords` only
    # after Article.nlp(), which is not called here - confirm this is intended.
    if news_article.summary:
        summary = news_article.summary
    elif news_article.text:
        # Slicing already clamps to the text length; use the first 500 chars.
        summary = news_article.text[:500] + "..."
    else:
        summary = fallback.summary
    article_object.summary = summary

    # Pass copies so the article object never shares a list with the fallback.
    article_object.set_authors(list(news_article.authors))
    article_object.set_keywords(list(news_article.keywords))
    return article_object