26 lines
603 B
Python
26 lines
603 B
Python
"""
Extract all http(s) URLs from a directory of mails exported from Thunderbird
and write them, as a JSON list, to 'mails_url_export.json'.
"""
|
|
import os
|
|
import re
|
|
import json
|
|
|
|
# Directory holding the mails exported from Thunderbird, and the name of the
# JSON report that is written into that same directory.
MAIL_DIR = "/home/remy/Downloads/mails/"
EXPORT_NAME = "mails_url_export.json"

# Raw string: "\s" must reach the regex engine untouched. In a normal string
# literal it is an invalid escape sequence, which recent Python versions warn
# about. The pattern matches "http://" or "https://" followed by any run of
# non-whitespace characters.
regex = r"(?P<url>https?://[^\s]+)"
URL_PATTERN = re.compile(regex)


def extract_urls(text):
    """Return every http(s) URL found in *text*, in order of appearance.

    A URL is taken to be ``http://`` or ``https://`` followed by every
    character up to the next whitespace, so trailing punctuation that is
    glued to a URL is kept as part of it.
    """
    return URL_PATTERN.findall(text)


def collect_urls(directory, skip=(EXPORT_NAME,)):
    """Collect the URLs of every regular file directly inside *directory*.

    Args:
        directory: Path of the directory containing the exported mails.
        skip: File names to ignore. Defaults to the export file itself, so
            that running the script twice does not re-ingest the URLs
            written by the previous run.

    Returns:
        A list of URL strings (duplicates included). Files are visited in
        sorted name order so the result is reproducible across runs.
    """
    urls = []
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        # Sub-directories cannot be opened as mails; skipped names are ours.
        if name in skip or not os.path.isfile(path):
            continue
        # errors="replace": a single byte that is not valid UTF-8 should not
        # abort the whole export; the undecodable byte becomes U+FFFD.
        with open(path, "r", encoding="utf8", errors="replace") as mail:
            urls.extend(extract_urls(mail.read()))
    return urls


def main(directory=MAIL_DIR):
    """Extract the URLs of all mails in *directory* and save them as JSON.

    The list is written to ``mails_url_export.json`` inside *directory* and
    the number of saved URLs is printed once the file has been written.

    Returns:
        The list of URLs that was written.
    """
    all_urls = collect_urls(directory)
    with open(os.path.join(directory, EXPORT_NAME), "w", encoding="utf8") as f:
        json.dump(all_urls, f)
    # Report only after the file has actually been written.
    print("Saved {} urls".format(len(all_urls)))
    return all_urls


if __name__ == "__main__":
    main()