diff --git a/README.md b/README.md
index e7acaf9f..62cbdbf4 100644
--- a/README.md
+++ b/README.md
@@ -39,13 +39,14 @@ Free, Open-source web page monitoring, notification and change detection. Don't
 - COVID related news from government websites
 - University/organisation news from their website
 - Detect and monitor changes in JSON API responses
-- API monitoring and alerting
+- JSON API monitoring and alerting
 - Changes in legal and other documents
 - Trigger API calls via notifications when text appears on a website
 - Glue together APIs using the JSON filter and JSON notifications
 - Create RSS feeds based on changes in web content
+- Monitor HTML source code for unexpected changes, strengthen your PCI compliance
 - You have a very sensitive list of URLs to watch and you do _not_ want to use the paid alternatives. (Remember, _you_ are the product)
-
+
 _Need an actual Chrome runner with Javascript support? We support fetching via WebDriver!_
 
 ## Screenshots
diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
index b26d17fc..ff31bccd 100644
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -708,7 +708,7 @@ def changedetection_app(config=None, datastore_o=None):
             url = url.strip()
             url, *tags = url.split(" ")
             # Flask wtform validators wont work with basic auth, use validators package
-            if len(url) and validators.url(url):
+            if len(url) and validators.url(url.replace('source:', '')):
                 new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags))
                 # Straight into the queue.
                 update_q.put(new_uuid)
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 9a783faf..7fd86611 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -20,7 +20,7 @@ class perform_site_check():
         timestamp = int(time.time())  # used for storage etc too
 
         changed_detected = False
-        screenshot = False # as bytes
+        screenshot = False  # as bytes
         stripped_text_from_html = ""
 
         watch = self.datastore.data['watching'][uuid]
@@ -52,6 +52,12 @@ class perform_site_check():
         request_method = self.datastore.get_val(uuid, 'method')
         ignore_status_code = self.datastore.get_val(uuid, 'ignore_status_codes')
 
+        # source: support
+        is_source = False
+        if url.startswith('source:'):
+            url = url.replace('source:', '')
+            is_source = True
+
         # Pluggable content fetcher
         prefer_backend = watch['fetch_backend']
         if hasattr(content_fetcher, prefer_backend):
@@ -60,7 +66,6 @@
             # If the klass doesnt exist, just use a default
             klass = getattr(content_fetcher, "html_requests")
 
-
         fetcher = klass()
         fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code)
         # Fetching complete, now filters
@@ -75,6 +80,12 @@
             is_json = 'application/json' in fetcher.headers.get('Content-Type', '')
             is_html = not is_json
 
+
+            # source: support, basically treat it as plaintext
+            if is_source:
+                is_html = False
+                is_json = False
+
             css_filter_rule = watch['css_filter']
             subtractive_selectors = watch.get(
                 "subtractive_selectors", []
@@ -94,7 +105,7 @@
                 stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
                 is_html = False
 
-            if is_html:
+            if is_html or is_source:
                 # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                 html_content = fetcher.content
 
@@ -113,15 +124,24 @@
                     html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
 
                 if has_subtractive_selectors:
                     html_content = html_tools.element_removal(subtractive_selectors, html_content)
-                # extract text
-                stripped_text_from_html = \
-                    html_tools.html_to_text(
-                        html_content,
-                        render_anchor_tag_content=self.datastore.data["settings"][
-                            "application"].get(
-                            "render_anchor_tag_content", False)
-                    )
-
+
+                if not is_source:
+                    # extract text
+                    stripped_text_from_html = \
+                        html_tools.html_to_text(
+                            html_content,
+                            render_anchor_tag_content=self.datastore.data["settings"][
+                                "application"].get(
+                                "render_anchor_tag_content", False)
+                        )
+
+                elif is_source:
+                    stripped_text_from_html = html_content
+
+            # Re #340 - return the content before the 'ignore text' was applied
+            text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
+
         # Re #340 - return the content before the 'ignore text' was applied
         text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
@@ -161,13 +181,11 @@
                 if result:
                     blocked_by_not_found_trigger_text = False
 
-
         if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5:
             changed_detected = True
             update_obj["previous_md5"] = fetched_md5
             update_obj["last_changed"] = timestamp
-
         # Extract title as title
         if is_html:
             if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']:
 
@@ -179,4 +197,4 @@
 
             fetcher.quit()
 
-        return changed_detected, update_obj, text_content_before_ignored_filter, screenshot
+        return changed_detected, update_obj, text_content_before_ignored_filter, screenshot
\ No newline at end of file
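The `fetch_site_status.py` hunks above are the core of the feature. As a rough standalone sketch (not the patch code itself) of what a `source:` watch does differently, with `fetch_page` and `html_to_text` as hypothetical stand-ins for the pluggable fetcher and `html_tools.html_to_text`:

```python
# Illustrative sketch only - `fetch_page` and `html_to_text` are hypothetical
# stand-ins, not functions from the patch.

def text_for_comparison(url, fetch_page, html_to_text):
    # A watch URL may carry a "source:" prefix, e.g. "source:https://example.com"
    is_source = url.startswith('source:')
    if is_source:
        # Strip the prefix so the real address gets fetched
        url = url.replace('source:', '')

    content = fetch_page(url)

    if is_source:
        # source: watches are treated as plain text: the raw HTML/JS itself
        # is what gets hashed and diffed
        return content

    # Normal watches are reduced to their visible text before comparison
    return html_to_text(content)
```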
diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html
index 2e00b057..5bf725a7 100644
--- a/changedetectionio/templates/watch-overview.html
+++ b/changedetectionio/templates/watch-overview.html
@@ -51,7 +51,7 @@
             <td class="inline paused-state state-{{watch.paused}}"><a href="{{url_for('index', pause=watch.uuid, tag=active_tag)}}"><img src="{{url_for('static_content', group='images', filename='pause.svg')}}" alt="Pause" title="Pause"/></a></td>
 
             <td class="title-col inline">{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}}
-                <a class="external" target="_blank" rel="noopener" href="{{ watch.url }}"></a>
+                <a class="external" target="_blank" rel="noopener" href="{{ watch.url.replace('source:','') }}"></a>
                {%if watch.fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" />{% endif %}
                {% if watch.last_error is defined and watch.last_error != False %}
                <div class="fetch-error">{{ watch.last_error }}</div>
diff --git a/changedetectionio/tests/test_backend.py b/changedetectionio/tests/test_backend.py
index c23d01ee..79143c4e 100644
--- a/changedetectionio/tests/test_backend.py
+++ b/changedetectionio/tests/test_backend.py
@@ -50,6 +50,14 @@ def test_check_basic_change_detection_functionality(client, live_server):
 
     #####################
 
+    # Check HTML conversion detected and worked
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+    # Check this class does not appear (that we didn't see the actual source)
+    assert b'foobar-detection' not in res.data
+
     # Make a change
     set_modified_response()
 
diff --git a/changedetectionio/tests/test_source.py b/changedetectionio/tests/test_source.py
new file mode 100644
index 00000000..f3b153b2
--- /dev/null
+++ b/changedetectionio/tests/test_source.py
@@ -0,0 +1,95 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from urllib.request import urlopen
+from .util import set_original_response, set_modified_response, live_server_setup
+
+sleep_time_for_fetch_thread = 3
+
+def test_setup(live_server):
+    live_server_setup(live_server)
+
+def test_check_basic_change_detection_functionality_source(client, live_server):
+    set_original_response()
+    test_url = 'source:'+url_for('test_endpoint', _external=True)
+    # Add our URL to the import page
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+
+    assert b"1 Imported" in res.data
+
+    time.sleep(sleep_time_for_fetch_thread)
+
+    #####################
+
+    # Check HTML conversion detected and worked
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    # Check this class DOES appear (we should be seeing the actual page source)
+    assert b'foobar-detection' in res.data
+
+    # Make a change
+    set_modified_response()
+
+    # Force recheck
+    res = client.get(url_for("api_watch_checknow"), follow_redirects=True)
+    assert b'1 watches are queued for rechecking.' in res.data
+
+    time.sleep(5)
+
+    # Now something should be ready, indicated by having a 'unviewed' class
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
+
+    res = client.get(
+        url_for("diff_history_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b'<title>modified head title' in res.data
+
+
+
+def test_check_ignore_elements(client, live_server):
+    set_original_response()
+
+    time.sleep(2)
+    test_url = 'source:'+url_for('test_endpoint', _external=True)
+    # Add our URL to the import page
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+
+    assert b"1 Imported" in res.data
+
+    time.sleep(sleep_time_for_fetch_thread)
+
+    #####################
+    # We want <span> and <p> ONLY, but ignore the span with the .foobar-detection class
+
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": 'span,p', "url": test_url, "tag": "", "subtractive_selectors": ".foobar-detection", 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+
+    time.sleep(sleep_time_for_fetch_thread)
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b'foobar-detection' not in res.data
+    assert b'<br' not in res.data
+    assert b'<p' in res.data
\ No newline at end of file
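For `source:` watches the CSS filter and subtractive selectors still run; only the final text-extraction step is skipped, which is what `test_check_ignore_elements` above relies on. A rough sketch using the `html_tools` helpers that `fetch_site_status.py` already calls (the sample markup is made up for illustration):

```python
# Rough sketch of the behaviour test_check_ignore_elements expects for a
# "source:" watch; the sample HTML below is invented for illustration.
from changedetectionio import html_tools

raw_html = "<p>price list</p><br><span class='foobar-detection'>hidden marker</span>"

# Keep only the elements matching the CSS filter (same call as fetch_site_status.py)
filtered = html_tools.css_filter(css_filter='span,p', html_content=raw_html)

# Drop anything matching the subtractive selectors
filtered = html_tools.element_removal(['.foobar-detection'], filtered)

# For a source: watch this filtered markup, not its extracted text, is diffed
assert 'foobar-detection' not in filtered
assert '<p>' in filtered
```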
diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py
index 3c7e89e4..e2043747 100644
--- a/changedetectionio/tests/util.py
+++ b/changedetectionio/tests/util.py
@@ -10,6 +10,7 @@ def set_original_response():
      <p>Which is across multiple lines</p>
      </br>
      So let's see what happens.  </br>
+     <span class="foobar-detection" style='display:none'></span>
      </body>
      </html>
     """
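A small detail from the `__init__.py` hunk: the prefixed form is not itself a valid URL, so validation runs against the stripped address while the stored watch keeps its `source:` prefix. A tiny sketch (the example entry is hypothetical):

```python
import validators

entry = "source:https://example.com"  # hypothetical entry from the import page

bare_url = entry.replace('source:', '')
if len(bare_url) and validators.url(bare_url):
    # The watch would be stored with its original "source:" prefix intact
    print("accepted:", entry)
```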