from newspaper import Article
from urllib.parse import urlparse
from htmldate import find_date
import datetime
import logging

# Silence chatty third-party loggers; only our own "fetch" logger stays verbose.
for _noisy in (
    'newspaper',
    'urllib',
    'urllib3.poolmanager',
    'htmldate',
    'charset_normalizer',
):
    logging.getLogger(_noisy).setLevel(logging.ERROR)

logger = logging.getLogger("fetch")


class NewspaperDummy():
    """Placeholder stand-in for a newspaper ``Article`` when fetch/parse fails."""
    title = "Error while running fetch"
    summary = "Error while running fetch"
    text = "Error while running fetch"
    authors = []
    keywords = []


def get_description(article_object):
    """Populate ``article_object`` with metadata scraped from its URL.

    Sets ``source_name``, ``pub_date``, ``title`` and ``summary`` on the
    object and passes authors/keywords through ``set_authors`` /
    ``set_keywords``. Falls back to ``NewspaperDummy`` placeholder values
    (and a 1900-01-01 sentinel date) when fetching or parsing fails.

    Args:
        article_object: object with an ``article_url`` attribute and the
            setters/attributes written above (project type — shape assumed
            from usage here).

    Returns:
        The same ``article_object``, mutated in place.
    """
    url = article_object.article_url
    article_object.source_name = urlparse(url).netloc

    # htmldate.find_date returns an ISO 'YYYY-MM-DD' string (or None).
    # BUG FIX: the original format '%Y-%d-%M' parsed the month field as the
    # day and the day field as the *minute*, mangling every publication date.
    try:
        pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d')
    except Exception as exc:  # find_date may return None or fail on non-HTML
        logger.debug("no publication date for %s: %s", url, exc)
        pub_date = datetime.datetime(year=1900, month=1, day=1)
    article_object.pub_date = pub_date

    fallback = NewspaperDummy()
    try:
        news_article = Article(url)
        news_article.download()
        news_article.parse()
    except Exception as exc:  # network/parse failure: degrade to placeholders
        logger.debug("newspaper fetch failed for %s: %s", url, exc)
        news_article = fallback

    # Title: prefer the parsed one, else the placeholder.
    title = news_article.title if news_article.title else fallback.title

    # Summary: parsed summary, else a <=500-char teaser of the body text
    # (slicing already clamps to the string length), else the placeholder.
    if news_article.summary:
        summary = news_article.summary
    elif news_article.text:
        summary = news_article.text[:500] + "..."
    else:
        summary = fallback.summary

    article_object.title = title
    article_object.summary = summary
    article_object.set_authors(news_article.authors)
    article_object.set_keywords(news_article.keywords)
    return article_object