From 9c0c8bf6aa791b66d355a05cc7bb07f4af87a257 Mon Sep 17 00:00:00 2001 From: Leigh Morresi <275001+dgtlmoon@users.noreply.github.com> Date: Thu, 28 Jan 2021 14:45:01 +0100 Subject: [PATCH] Remove actual :// links, don't consider these as part of the changes, often they include variables/tracking script ref etc --- backend/fetch_site_status.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py index de765f8d..e74e668e 100644 --- a/backend/fetch_site_status.py +++ b/backend/fetch_site_status.py @@ -3,6 +3,10 @@ import time import requests import hashlib import os +import re +import html2text +from urlextract import URLExtract + # Hmm Polymorphism datastore, thread, etc class perform_site_check(Thread): @@ -53,17 +57,30 @@ class perform_site_check(Thread): extra_headers = self.datastore.get_val(self.uuid, 'headers') headers.update(extra_headers) - print (headers) - - print("Checking", self.url) - import html2text + self.ensure_output_path() try: r = requests.get(self.url, headers=headers, timeout=15, verify=False) stripped_text_from_html = html2text.html2text(r.content.decode('utf-8')) + # @todo This should be a config option. + # Many websites include junk in the links, trackers, etc.. Since we are really a service all about text changes.. + + extractor = URLExtract() + urls = extractor.find_urls(stripped_text_from_html) + # Remove the urls, longest first so that we dont end up chewing up bigger links with parts of smaller ones. + if urls: + urls.sort(key=len, reverse=True) + + for url in urls: + # Sometimes URLExtract will consider something like 'foobar.com' as a link when that was just text. + if "://" in url: + #print ("Stripping link", url) + stripped_text_from_html = stripped_text_from_html.replace(url, '') + + # Usually from networkIO/requests level except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout) as e: