Fehler ohne KI und Debugging gefunden...
This commit is contained in:
parent
ada8c4ae48
commit
f0ebdc7888
46
app.py
46
app.py
@@ -31,48 +31,30 @@ def get_text_from_email(email_message):
|
|||||||
text += body.decode(errors="ignore")
|
text += body.decode(errors="ignore")
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def find_unsubscribe_links(text):
|
def find_unsubscribe_links(text, html_links):
|
||||||
links = re.finditer(link_pattern, text)
|
soup = BeautifulSoup(text, 'html.parser')
|
||||||
unsubscribe_links = []
|
unsubscribe_links = []
|
||||||
for link in links:
|
|
||||||
link_url = link.group(1) # Die URL des Links befindet sich in der Gruppe 1
|
|
||||||
link_text = link.group(2) # Der Text des Links befindet sich in der Gruppe 2
|
|
||||||
|
|
||||||
# Überprüfen, ob der Link-Text oder die URL auf das Unsubscribe-Pattern passen
|
|
||||||
if re.search(unsubscribe_pattern, link_text) or re.search(unsubscribe_pattern, link_url):
|
|
||||||
unsubscribe_links.append(link.group(0))
|
|
||||||
return unsubscribe_links
|
|
||||||
|
|
||||||
def find_unsubscribe_links_in_text(text, html_links):
|
for link in soup.find_all('a', href=True):
|
||||||
soup = BeautifulSoup(text, 'html.parser', multi_valued_attributes=None)
|
if re.search(unsubscribe_pattern, str(link)) and str(link['href']) not in html_links:
|
||||||
unsubscribe_links = []
|
print(link['href'])
|
||||||
|
test1=str(link['href'])
|
||||||
|
print(test1)
|
||||||
|
if test1 not in unsubscribe_links:
|
||||||
|
print(test1)
|
||||||
|
unsubscribe_links.append(test1)
|
||||||
|
|
||||||
|
|
||||||
# Zusätzlich nach einfachen URLs suchen und auf das Unsubscribe-Pattern prüfen
|
# Zusätzlich nach einfachen URLs suchen und auf das Unsubscribe-Pattern prüfen
|
||||||
simple_urls = re.findall(r"(https?://\S+)", text)
|
simple_urls = re.findall(r"(https?://\S+)", text)
|
||||||
for url in simple_urls:
|
for url in simple_urls:
|
||||||
if re.search(unsubscribe_pattern, url) and url not in html_links:
|
if re.search(unsubscribe_pattern, url) and url not in html_links:
|
||||||
|
if url not in unsubscribe_links:
|
||||||
unsubscribe_links.append(url)
|
unsubscribe_links.append(url)
|
||||||
|
|
||||||
for link in soup.find_all('a', href=True):
|
|
||||||
link_url = str(link.get('href'))
|
|
||||||
# Überprüfen, ob der Link-URL oder der Link-Text auf das Unsubscribe-Pattern passt und nicht in den HTML-Links enthalten ist
|
|
||||||
if re.search(unsubscribe_pattern, str(link)) and link_url not in html_links:
|
|
||||||
unsubscribe_links.append(link_url)
|
|
||||||
|
|
||||||
return unsubscribe_links
|
return unsubscribe_links
|
||||||
|
|
||||||
def find_links_in_text(text):
|
|
||||||
link_pattern = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
|
|
||||||
links = re.findall(link_pattern, text)
|
|
||||||
return links
|
|
||||||
|
|
||||||
|
|
||||||
def find_all_links_in_text(text):
|
|
||||||
links = []
|
|
||||||
for match in re.findall(link_pattern, text):
|
|
||||||
links.append(match)
|
|
||||||
return links
|
|
||||||
|
|
||||||
def get_subject_from_email(email_message):
|
def get_subject_from_email(email_message):
|
||||||
subject = email_message.get("Subject")
|
subject = email_message.get("Subject")
|
||||||
return subject
|
return subject
|
||||||
@@ -122,9 +104,9 @@ def result():
|
|||||||
body = get_text_from_email(msg)
|
body = get_text_from_email(msg)
|
||||||
subject = get_subject_from_email(msg)
|
subject = get_subject_from_email(msg)
|
||||||
if body:
|
if body:
|
||||||
unsubscribe_links_in_html = find_unsubscribe_links(body)
|
unsubscribe_links_in_html = find_unsubscribe_links(body, html_links)
|
||||||
html_links.update(unsubscribe_links_in_html)
|
html_links.update(unsubscribe_links_in_html)
|
||||||
unsubscribe_links_in_text = find_unsubscribe_links_in_text(body, html_links)
|
unsubscribe_links_in_text = find_unsubscribe_links(body, html_links)
|
||||||
all_unsubscribe_links = list(set(unsubscribe_links_in_html + unsubscribe_links_in_text))
|
all_unsubscribe_links = list(set(unsubscribe_links_in_html + unsubscribe_links_in_text))
|
||||||
if all_unsubscribe_links:
|
if all_unsubscribe_links:
|
||||||
links_data[message_num.decode()] = {
|
links_data[message_num.decode()] = {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user