Compare commits

...

6 Commits

Author SHA1 Message Date
99ec2b956b modified: app.py 2023-08-02 04:42:13 +02:00
f0ebdc7888 Fehler ohne KI und debugging gefunden... 2023-08-02 04:01:38 +02:00
ada8c4ae48 What have I done? not completed one stupid href 2023-08-02 03:13:37 +02:00
3765d30822 modified: .gitignore 2023-08-02 00:49:02 +02:00
bd849025bf new file: .env.example 2023-08-02 00:09:47 +02:00
9d222723b6 new file: .gitignore 2023-08-02 00:07:31 +02:00
5 changed files with 38 additions and 25 deletions

3
.env.example Normal file
View File

@ -0,0 +1,3 @@
EMAIL_USERNAME=example@example.com
EMAIL_PASSWORD=example
EMAIL_SERVER=example.com

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.env
*.eml

55
app.py
View File

@ -4,6 +4,7 @@ import re
import os import os
from flask import Flask, render_template, request, redirect, url_for from flask import Flask, render_template, request, redirect, url_for
from dotenv import load_dotenv from dotenv import load_dotenv
from bs4 import BeautifulSoup
app = Flask(__name__) app = Flask(__name__)
load_dotenv() # Laden der Umgebungsvariablen aus der .env-Datei load_dotenv() # Laden der Umgebungsvariablen aus der .env-Datei
@ -15,6 +16,8 @@ EMAIL_SERVER = os.environ.get("EMAIL_SERVER")
template_dir = os.path.abspath(os.path.dirname(__file__)) template_dir = os.path.abspath(os.path.dirname(__file__))
app = Flask(__name__, template_folder=template_dir) app = Flask(__name__, template_folder=template_dir)
link_pattern = r"<a href=\"(.*?)\">(.*?)</a>"
unsubscribe_pattern = r"(?i)\babbestellen\b|\bunsubscribe\b|\bdeabonnieren\b|\babbestellung\b|\babmelden\b"
def get_text_from_email(email_message): def get_text_from_email(email_message):
text = "" text = ""
@ -28,29 +31,29 @@ def get_text_from_email(email_message):
text += body.decode(errors="ignore") text += body.decode(errors="ignore")
return text return text
def find_unsubscribe_links(text): def find_unsubscribe_links(text, html_links):
# Erweitere die Liste der Abmeldelinks-Patterns bei Bedarf soup = BeautifulSoup(text, 'html.parser')
unsubscribe_patterns = [
r"(?i)\babbestellen\b", # Deutsch - abbestellen
r"(?i)\babbestellung\b", # Deutsch - abbestellung
r"(?i)\bdeabonnieren\b", # Deutsch - deabonnieren
r"(?i)\bunsubscribe\b", # Englisch - unsubscribe
]
unsubscribe_links = [] unsubscribe_links = []
for pattern in unsubscribe_patterns:
matches = re.findall(pattern, text)
if matches:
for match in matches:
link = find_link_in_text(text)
if link:
unsubscribe_links.append(link)
return unsubscribe_links
def find_link_in_text(text):
# Einen einfachen Ansatz zur Link-Erkennung in Texten (nicht perfekt) for link in soup.find_all('a', href=True):
link_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" if re.search(unsubscribe_pattern, str(link)) and str(link['href']) not in html_links:
link_matches = re.findall(link_pattern, text) print(link['href'])
return link_matches[0] if link_matches else None test1=str(link['href'])
print(test1)
if test1 not in unsubscribe_links:
print(test1)
unsubscribe_links.append(test1)
# Zusätzlich nach einfachen URLs suchen und auf das Unsubscribe-Pattern prüfen
simple_urls = re.findall(r"(https?://\S+)", text)
for url in simple_urls:
if re.search(unsubscribe_pattern, url) and url not in html_links:
if url not in unsubscribe_links:
unsubscribe_links.append(url)
return unsubscribe_links
def get_subject_from_email(email_message): def get_subject_from_email(email_message):
subject = email_message.get("Subject") subject = email_message.get("Subject")
@ -93,6 +96,7 @@ def result():
_, messages = mail.search(None, "ALL") _, messages = mail.search(None, "ALL")
links_data = {} links_data = {}
html_links = set()
for message_num in messages[0].split(): for message_num in messages[0].split():
_, msg_data = mail.fetch(message_num, "(RFC822)") _, msg_data = mail.fetch(message_num, "(RFC822)")
@ -100,11 +104,14 @@ def result():
body = get_text_from_email(msg) body = get_text_from_email(msg)
subject = get_subject_from_email(msg) subject = get_subject_from_email(msg)
if body: if body:
unsubscribe_links = find_unsubscribe_links(body) unsubscribe_links_in_html = find_unsubscribe_links(body, html_links)
if unsubscribe_links: html_links.update(unsubscribe_links_in_html)
unsubscribe_links_in_text = find_unsubscribe_links(body, html_links)
all_unsubscribe_links = list(set(unsubscribe_links_in_html + unsubscribe_links_in_text))
if all_unsubscribe_links:
links_data[message_num.decode()] = { links_data[message_num.decode()] = {
"subject": subject, "subject": subject,
"links": list(set(unsubscribe_links)) "links": all_unsubscribe_links
} }
mail.logout() mail.logout()

View File

@ -1,2 +1,3 @@
Flask Flask
python-dotenv python-dotenv
beautifulsoup4

View File

@ -9,7 +9,7 @@
<h1>E-Mail Unsubscribe Links Result</h1> <h1>E-Mail Unsubscribe Links Result</h1>
<table> <table>
<tr> <tr>
<th>Email Address</th> <th>Email ID</th>
<th>Subject</th> <th>Subject</th>
<th>Unsubscribe Link</th> <th>Unsubscribe Link</th>
<th>Move to Trash</th> <th>Move to Trash</th>