coss_archiving/manual/exctract_from_mail_backup.py

26 lines
603 B
Python

"""
Extract all URLs from the mails exported from Thunderbird and write them to 'mails_url_export.json'.
"""
import os
import re
import json
# Directory holding the mails exported from Thunderbird. The JSON export is
# written into this same directory.
MAILS_DIR = "/home/remy/Downloads/mails/"
OUTPUT_FILE = "mails_url_export.json"

# Raw string so the `\s` escape reaches the regex engine untouched (a plain
# string literal triggers an invalid-escape warning on recent Pythons).
# A URL runs from the scheme up to the next whitespace character.
regex = r"(?P<url>https?://[^\s]+)"


def extract_urls(text):
    """Return every http(s) URL found in *text*, in order of appearance.

    Matching stops at the first whitespace character, so any punctuation
    glued to the end of a URL is kept as part of it. Duplicates are kept.
    """
    return re.findall(regex, text)


def collect_urls(directory="."):
    """Return the URLs found in all regular files directly inside *directory*.

    Files are visited in sorted name order so the result is reproducible.
    Sub-directories are skipped (they cannot be opened as mails), and so is a
    previous ``OUTPUT_FILE`` export, which would otherwise be harvested as if
    it were a mail when the script is run a second time.
    """
    all_urls = []
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if name == OUTPUT_FILE or not os.path.isfile(path):
            continue
        # errors="replace": one mail that is not valid UTF-8 must not abort
        # the whole export; URLs themselves are ASCII and survive decoding.
        with open(path, "r", encoding="utf8", errors="replace") as mail:
            all_urls += extract_urls(mail.read())
    return all_urls


def main():
    """Extract the URLs of every mail in ``MAILS_DIR`` and dump them as JSON."""
    os.chdir(MAILS_DIR)
    all_urls = collect_urls(".")
    with open(OUTPUT_FILE, "w", encoding="utf8") as f:
        json.dump(all_urls, f)
    # Report only once the file has actually been written.
    print("Saved {} urls".format(len(all_urls)))


if __name__ == "__main__":
    main()