Re-test under HIDE_REFERER, use strtobool so you can use 'False'

Stability fix related to the new watch check count (#1113 )
Use deepcopy to stop possible data corruption (#1108 )
2022-11-12 13:38:11 +01:00 · 2022-11-10 20:01:07 +01:00 · 2022-11-08 12:18:38 +01:00 · 2022-11-07 20:43:20 +01:00 · 2022-11-06 09:48:07 +01:00 · 2022-11-05 12:22:52 +01:00
6 changed files with 81 additions and 68 deletions
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 ## Web Site Change Detection, Monitoring and Notification.

-_Live your data-life pro-actively, track website and JSON content changes, trigger notifications via Discord, Email, Slack, Telegram, API calls and 70+ more._
+_Live your data-life pro-actively, detect website changes and perform meaningful actions, trigger notifications via Discord, Email, Slack, Telegram, API calls and many more._


 [<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring"  title="Self-hosted web page change monitoring"  />](https://lemonade.changedetection.io/start?src=github)
--- a/changedetectionio/changedetection.py
+++ b/changedetectionio/changedetection.py
@@ -2,19 +2,20 @@

 # Launch as a eventlet.wsgi server instance.

+from distutils.util import strtobool
+import eventlet
+import eventlet.wsgi
 import getopt
 import os
 import signal
 import sys

-import eventlet
-import eventlet.wsgi
 from . import store, changedetection_app, content_fetcher
 from . import __version__

 # Only global so we can access it in the signal handler
-datastore = None
 app = None
+datastore = None

 def sigterm_handler(_signo, _stack_frame):
    global app
@@ -106,8 +107,9 @@ def main():
    # @Note: Incompatible with password login (and maybe other features) for now, submit a PR!
    @app.after_request
    def hide_referrer(response):
-        if os.getenv("HIDE_REFERER", False):
+        if strtobool(os.getenv("HIDE_REFERER", False)):
            response.headers["Referrer-Policy"] = "no-referrer"
+
        return response

    # Proxy sub-directory support
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -15,7 +15,6 @@ class FilterNotFoundInResponse(ValueError):
        ValueError.__init__(self, msg)


-
 # Some common stuff here that can be moved to a base class
 # (set_proxy_from_list)
 class perform_site_check():
@@ -39,18 +38,20 @@ class perform_site_check():

        return regex

-
    def run(self, uuid):
+        from copy import deepcopy
        changed_detected = False
        screenshot = False  # as bytes
        stripped_text_from_html = ""

-        watch = self.datastore.data['watching'].get(uuid)
+        # DeepCopy so we can be sure we don't accidentally change anything by reference
+        watch = deepcopy(self.datastore.data['watching'].get(uuid))
+
        if not watch:
            return

        # Protect against file:// access
-        if re.search(r'^file', watch['url'], re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
+        if re.search(r'^file', watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
            raise Exception(
                "file:// type access is denied for security reasons."
            )
@@ -58,10 +59,10 @@ class perform_site_check():
        # Unset any existing notification error
        update_obj = {'last_notification_error': False, 'last_error': False}

-        extra_headers =self.datastore.data['watching'][uuid].get('headers')
+        extra_headers = watch.get('headers', [])

        # Tweak the base config with the per-watch ones
-        request_headers = self.datastore.data['settings']['headers'].copy()
+        request_headers = deepcopy(self.datastore.data['settings']['headers'])
        request_headers.update(extra_headers)

        # https://github.com/psf/requests/issues/4525
@@ -85,7 +86,7 @@ class perform_site_check():
            is_source = True

        # Pluggable content fetcher
-        prefer_backend = watch['fetch_backend']
+        prefer_backend = watch.get('fetch_backend')
        if hasattr(content_fetcher, prefer_backend):
            klass = getattr(content_fetcher, prefer_backend)
        else:
@@ -96,21 +97,21 @@ class perform_site_check():
        proxy_url = None
        if proxy_id:
            proxy_url = self.datastore.proxy_list.get(proxy_id).get('url')
-            print ("UUID {} Using proxy {}".format(uuid, proxy_url))
+            print("UUID {} Using proxy {}".format(uuid, proxy_url))

        fetcher = klass(proxy_override=proxy_url)

        # Configurable per-watch or global extra delay before extracting text (for webDriver types)
        system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
        if watch['webdriver_delay'] is not None:
-            fetcher.render_extract_delay = watch['webdriver_delay']
+            fetcher.render_extract_delay = watch.get('webdriver_delay')
        elif system_webdriver_delay is not None:
            fetcher.render_extract_delay = system_webdriver_delay

-        if watch['webdriver_js_execute_code'] is not None and watch['webdriver_js_execute_code'].strip():
-            fetcher.webdriver_js_execute_code = watch['webdriver_js_execute_code']
+        if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip():
+            fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code')

-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch['include_filters'])
+        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'))
        fetcher.quit()

        self.screenshot = fetcher.screenshot
@@ -134,7 +135,8 @@ class perform_site_check():
            is_html = False
            is_json = False

-        include_filters_rule = watch['include_filters']
+        include_filters_rule = watch.get('include_filters', [])
+        # include_filters_rule = watch['include_filters']
        subtractive_selectors = watch.get(
            "subtractive_selectors", []
        ) + self.datastore.data["settings"]["application"].get(
@@ -156,7 +158,7 @@ class perform_site_check():
                    is_html = False

        if is_html or is_source:
-            
+
            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
            fetcher.content = html_tools.workarounds_for_obfuscations(fetcher.content)
            html_content = fetcher.content
@@ -178,8 +180,8 @@ class perform_site_check():
                        else:
                            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                            html_content += html_tools.include_filters(include_filters=filter_rule,
-                                                                  html_content=fetcher.content,
-                                                                  append_pretty_line_formatting=not is_source)
+                                                                       html_content=fetcher.content,
+                                                                       append_pretty_line_formatting=not is_source)

                    if not html_content.strip():
                        raise FilterNotFoundInResponse(include_filters_rule)
@@ -191,12 +193,11 @@ class perform_site_check():
                    stripped_text_from_html = html_content
                else:
                    # extract text
+                    do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
                    stripped_text_from_html = \
                        html_tools.html_to_text(
                            html_content,
-                            render_anchor_tag_content=self.datastore.data["settings"][
-                                "application"].get(
-                                "render_anchor_tag_content", False)
+                            render_anchor_tag_content=do_anchor
                        )

        # Re #340 - return the content before the 'ignore text' was applied
@@ -231,7 +232,7 @@ class perform_site_check():

                for l in result:
                    if type(l) is tuple:
-                        #@todo - some formatter option default (between groups)
+                        # @todo - some formatter option default (between groups)
                        regex_matched_output += list(l) + [b'\n']
                    else:
                        # @todo - some formatter option default (between each ungrouped result)
@@ -245,7 +246,6 @@ class perform_site_check():
                stripped_text_from_html = b''.join(regex_matched_output)
                text_content_before_ignored_filter = stripped_text_from_html

-
        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
        if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
            fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
@@ -255,29 +255,30 @@ class perform_site_check():
        ############ Blocking rules, after checksum #################
        blocked = False

-        if len(watch['trigger_text']):
+        trigger_text = watch.get('trigger_text', [])
+        if len(trigger_text):
            # Assume blocked
            blocked = True
            # Filter and trigger works the same, so reuse it
            # It should return the line numbers that match
            result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
-                                                  wordlist=watch['trigger_text'],
+                                                  wordlist=trigger_text,
                                                  mode="line numbers")
            # Unblock if the trigger was found
            if result:
                blocked = False

-
-        if len(watch['text_should_not_be_present']):
+        text_should_not_be_present = watch.get('text_should_not_be_present', [])
+        if len(text_should_not_be_present):
            # If anything matched, then we should block a change from happening
            result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
-                                                  wordlist=watch['text_should_not_be_present'],
+                                                  wordlist=text_should_not_be_present,
                                                  mode="line numbers")
            if result:
                blocked = True

        # The main thing that all this at the moment comes down to :)
-        if watch['previous_md5'] != fetched_md5:
+        if watch.get('previous_md5') != fetched_md5:
            changed_detected = True

        # Looks like something changed, but did it match all the rules?
@@ -286,7 +287,7 @@ class perform_site_check():

        # Extract title as title
        if is_html:
-            if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']:
+            if self.datastore.data['settings']['application'].get('extract_title_as_title') or watch['extract_title_as_title']:
                if not watch['title'] or not len(watch['title']):
                    update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)

--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -16,42 +16,43 @@ class model(dict):
    __newest_history_key = None
    __history_n=0
    __base_config = {
-            'url': None,
-            'tag': None,
-            'last_checked': 0,
-            'paused': False,
-            'last_viewed': 0,  # history key value of the last viewed via the [diff] link
-            #'newest_history_key': 0,
-            'title': None,
-            'previous_md5': False,
-            'uuid': str(uuid.uuid4()),
-            'headers': {},  # Extra headers to send
+            #'history': {},  # Dict of timestamp and output stripped filename (removed)
+            #'newest_history_key': 0, (removed, taken from history.txt index)
            'body': None,
-            'method': 'GET',
-            #'history': {},  # Dict of timestamp and output stripped filename
+            'check_unique_lines': False, # On change-detected, compare against all history if its something new
+            'check_count': 0,
+            'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
+            'extract_text': [],  # Extract text by regex after filters
+            'extract_title_as_title': False,
+            'fetch_backend': None,
+            'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
+            'headers': {},  # Extra headers to send
            'ignore_text': [],  # List of text to ignore when calculating the comparison checksum
-            # Custom notification content
-            'notification_urls': [],  # List of URLs to add to the notification Queue (Usually AppRise)
-            'notification_title': None,
+            'include_filters': [],
+            'last_checked': 0,
+            'last_error': False,
+            'last_viewed': 0,  # history key value of the last viewed via the [diff] link
+            'method': 'GET',
+             # Custom notification content
            'notification_body': None,
            'notification_format': default_notification_format_for_watch,
            'notification_muted': False,
-            'include_filters': [],
-            'last_error': False,
-            'extract_text': [],  # Extract text by regex after filters
-            'subtractive_selectors': [],
-            'trigger_text': [],  # List of text or regex to wait for until a change is detected
-            'text_should_not_be_present': [], # Text that should not present
-            'fetch_backend': None,
-            'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
-            'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
-            'extract_title_as_title': False,
-            'check_unique_lines': False, # On change-detected, compare against all history if its something new
+            'notification_title': None,
+            'notification_urls': [],  # List of URLs to add to the notification Queue (Usually AppRise)
+            'paused': False,
+            'previous_md5': False,
            'proxy': None, # Preferred proxy connection
+            'subtractive_selectors': [],
+            'tag': None,
+            'text_should_not_be_present': [], # Text that should not present
            # Re #110, so then if this is set to None, we know to use the default value instead
            # Requires setting to None on submit if it's the same as the default
            # Should be all None by default, so we use the system default in this case.
            'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
+            'title': None,
+            'trigger_text': [],  # List of text or regex to wait for until a change is detected
+            'url': None,
+            'uuid': str(uuid.uuid4()),
            'webdriver_delay': None,
            'webdriver_js_execute_code': None, # Run before change-detection
        }
--- a/changedetectionio/run_all_tests.sh
+++ b/changedetectionio/run_all_tests.sh
@@ -24,6 +24,12 @@ echo "RUNNING WITH BASE_URL SET"
 export BASE_URL="https://really-unique-domain.io"
 pytest tests/test_notification.py

+
+# Re-run with HIDE_REFERER set - could affect login
+export HIDE_REFERER=True
+pytest tests/test_access_control.py
+
+
 # Now for the selenium and playwright/browserless fetchers
 # Note - this is not UI functional tests - just checking that each one can fetch the content

--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -282,16 +282,19 @@ class update_worker(threading.Thread):
                            self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
                            self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})

+                    if self.datastore.data['watching'].get(uuid):
+                        # Always record that we at least tried
+                        count = self.datastore.data['watching'][uuid].get('check_count', 0) + 1
+                        self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
+                                                                           'last_checked': round(time.time()),
+                                                                           'check_count': count
+                                                                           })

-                    # Always record that we atleast tried
-                    self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
-                                                                       'last_checked': round(time.time())})
-
-                    # Always save the screenshot if it's available
-                    if update_handler.screenshot:
-                        self.datastore.save_screenshot(watch_uuid=uuid, screenshot=update_handler.screenshot)
-                    if update_handler.xpath_data:
-                        self.datastore.save_xpath_data(watch_uuid=uuid, data=update_handler.xpath_data)
+                        # Always save the screenshot if it's available
+                        if update_handler.screenshot:
+                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=update_handler.screenshot)
+                        if update_handler.xpath_data:
+                            self.datastore.save_xpath_data(watch_uuid=uuid, data=update_handler.xpath_data)


                self.current_uuid = None  # Done
Author	SHA1	Message	Date
dgtlmoon	6cad2d9422	Re-test under HIDE_REFERER, use strtobool so you can use 'False'	2022-11-12 13:38:11 +01:00
dgtlmoon	359dcb63e3	Stability fix related to the new watch check count (#1113 )	2022-11-10 20:01:07 +01:00
dgtlmoon	b043d477dc	Use deepcopy to stop possible data corruption (#1108 )	2022-11-08 12:18:38 +01:00
dgtlmoon	06bcfb28e5	Code- Use dict .get instead of key	2022-11-07 20:43:20 +01:00
dgtlmoon	ca3b351bae	Adding a check counter to watch fetching (#1099 )	2022-11-06 09:48:07 +01:00
dgtlmoon	b7e0f0a5e4	Update README.md	2022-11-05 12:22:52 +01:00