From 9c0c8bf6aa791b66d355a05cc7bb07f4af87a257 Mon Sep 17 00:00:00 2001 From: Leigh Morresi <275001+dgtlmoon@users.noreply.github.com> Date: Thu, 28 Jan 2021 14:45:01 +0100 Subject: [PATCH] Remove actual :// links, don't consider these as part of the changes, often they include variables/tracking script ref etc --- backend/fetch_site_status.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py index de765f8d..e74e668e 100644 --- a/backend/fetch_site_status.py +++ b/backend/fetch_site_status.py @@ -3,6 +3,10 @@ import time import requests import hashlib import os +import re +import html2text +from urlextract import URLExtract + # Hmm Polymorphism datastore, thread, etc class perform_site_check(Thread): @@ -53,17 +57,30 @@ class perform_site_check(Thread): extra_headers = self.datastore.get_val(self.uuid, 'headers') headers.update(extra_headers) - print (headers) - - print("Checking", self.url) - import html2text + self.ensure_output_path() try: r = requests.get(self.url, headers=headers, timeout=15, verify=False) stripped_text_from_html = html2text.html2text(r.content.decode('utf-8')) + # @todo This should be a config option. + # Many websites include junk in the links, trackers, etc.. Since we are really a service all about text changes.. + + extractor = URLExtract() + urls = extractor.find_urls(stripped_text_from_html) + # Remove the urls, longest first so that we dont end up chewing up bigger links with parts of smaller ones. + if urls: + urls.sort(key=len, reverse=True) + + for url in urls: + # Sometimes URLExtract will consider something like 'foobar.com' as a link when that was just text. + if "://" in url: + #print ("Stripping link", url) + stripped_text_from_html = stripped_text_from_html.replace(url, '') + + # Usually from networkIO/requests level except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout) as e: