diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py
index de765f8d..e74e668e 100644
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@@ -3,6 +3,10 @@ import time
 import requests
 import hashlib
 import os
+import re
+import html2text
+from urlextract import URLExtract
+
 # Hmm Polymorphism datastore, thread, etc
 class perform_site_check(Thread):
 
@@ -53,17 +57,30 @@ class perform_site_check(Thread):
         extra_headers = self.datastore.get_val(self.uuid, 'headers')
         headers.update(extra_headers)
 
-        print (headers)
-
-        print("Checking", self.url)
-        import html2text
+        self.ensure_output_path()
 
         try:
             r = requests.get(self.url, headers=headers, timeout=15, verify=False)
             stripped_text_from_html = html2text.html2text(r.content.decode('utf-8'))
 
+            # @todo This should be a config option.
+            # Many websites include junk in the links, trackers, etc.. Since we are really a service all about text changes..
+
+            extractor = URLExtract()
+            urls = extractor.find_urls(stripped_text_from_html)
+            # Remove the urls, longest first so that we dont end up chewing up bigger links with parts of smaller ones.
+            if urls:
+                urls.sort(key=len, reverse=True)
+
+            for url in urls:
+                # Sometimes URLExtract will consider something like 'foobar.com' as a link when that was just text.
+                if "://" in url:
+                    #print ("Stripping link", url)
+                    stripped_text_from_html = stripped_text_from_html.replace(url, '')
+
+        # Usually from networkIO/requests level
         except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout) as e: