from newspaper import Article
from urllib.parse import urlparse
from htmldate import find_date
import datetime
import logging

# Silence noisy third-party loggers; only this module's "fetch" logger stays verbose.
logging.getLogger('newspaper').setLevel(logging.ERROR)
logging.getLogger('urllib').setLevel(logging.ERROR)
logging.getLogger('urllib3.poolmanager').setLevel(logging.ERROR)
logging.getLogger('htmldate').setLevel(logging.ERROR)
logging.getLogger('charset_normalizer').setLevel(logging.ERROR)

logger = logging.getLogger("fetch")


def get_description(article_object):
    """Fill in source, publication date, title, summary, language, authors and keywords."""
    url = article_object.article_url
    website = urlparse(url).netloc
    article_object.source_name = website

    # htmldate's find_date returns an ISO-style string (YYYY-MM-DD) or None;
    # fall back to a sentinel date when no publication date can be extracted.
    try:
        article_object.pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d')
    except Exception:  # e.g. non-HTML file types, or no date found
        article_object.pub_date = datetime.datetime(year=1900, month=1, day=1)

    # Download and parse the article; on failure fall back to a bare object()
    # so every attribute lookup below raises AttributeError and the defaults apply.
    try:
        news_article = Article(url)
        news_article.download()
        news_article.parse()
    except Exception:
        news_article = object()  # fallback value

    try:
        article_object.title = news_article.title
    except AttributeError:
        article_object.title = "Error while running fetch"

    # Overwrite any summary already on the object with the one parsed by newspaper;
    # otherwise fall back to the first 500 characters of the article text.
    try:
        if article_object.summary:
            article_object.summary = news_article.summary
        elif news_article.text:
            ind = min(500, len(news_article.text))
            article_object.summary = news_article.text[:ind] + "..."
        else:
            article_object.summary = ""
    except AttributeError:
        article_object.summary = ""

    try:
        article_object.language = news_article.meta_lang
    except AttributeError:
        article_object.language = ""

    try:
        article_object.set_authors(news_article.authors)
    except AttributeError:
        pass  # list would have been empty anyway

    try:
        article_object.set_keywords(news_article.keywords)
    except AttributeError:
        pass  # list would have been empty anyway

    return article_object
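

# Usage sketch, assuming a minimal stand-in article container: the attribute and
# method names below (article_url, set_authors, set_keywords, ...) are only
# inferred from how get_description uses them; the project's real article class
# may look different.
if __name__ == "__main__":
    class _StubArticle:
        """Hypothetical container exposing the attributes get_description touches."""

        def __init__(self, article_url):
            self.article_url = article_url
            self.source_name = ""
            self.pub_date = None
            self.title = ""
            self.summary = ""
            self.language = ""
            self.authors = []
            self.keywords = []

        def set_authors(self, authors):
            self.authors = list(authors)

        def set_keywords(self, keywords):
            self.keywords = list(keywords)

    filled = get_description(_StubArticle("https://example.com/some-article"))
    print(filled.source_name, filled.pub_date, filled.title)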