diff --git a/news_fetch/requirements.txt b/news_fetch/requirements.txt
index 6d4b765..29bab81 100644
--- a/news_fetch/requirements.txt
+++ b/news_fetch/requirements.txt
@@ -8,4 +8,5 @@ newspaper3k
 htmldate
 markdown
 rich
-psycopg2
\ No newline at end of file
+psycopg2
+unidecode
diff --git a/news_fetch/utils_storage/helpers.py b/news_fetch/utils_storage/helpers.py
index b15592e..24e3b6f 100644
--- a/news_fetch/utils_storage/helpers.py
+++ b/news_fetch/utils_storage/helpers.py
@@ -1,7 +1,11 @@
+import unidecode
+KEEPCHARACTERS = (' ','.','_', '-')
+
 def clear_path_name(path):
-    keepcharacters = (' ','.','_', '-')
-    converted = "".join([c if (c.isalnum() or c in keepcharacters) else "_" for c in path]).rstrip()
-    return converted
+    path = unidecode.unidecode(path) # remove umlauts, accents and others
+    path = "".join([c if (c.isalnum() or c in KEEPCHARACTERS) else "_" for c in path]) # remove all non-alphanumeric characters
+    path = path.rstrip() # remove trailing spaces
+    return path
 
 def shorten_name(name, offset = 50):
     if len(name) > offset: