diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 907e0c91..48342d93 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -11,6 +11,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # Some common stuff here that can be moved to a base class +# (set_proxy_from_list) class perform_site_check(): def __init__(self, *args, datastore, **kwargs): @@ -45,6 +46,20 @@ class perform_site_check(): return proxy_args + # Doesn't look like python supports forward slash auto enclosure in re.findall + # So convert it to inline flag "foobar(?i)" type configuration + def forward_slash_enclosed_regex_to_options(self, regex): + res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE) + + if res: + regex = res.group(1) + regex += '(?{})'.format(res.group(2)) + else: + regex += '(?{})'.format('i') + + return regex + + def run(self, uuid): timestamp = int(time.time()) # used for storage etc too @@ -215,15 +230,27 @@ class perform_site_check(): if len(extract_text) > 0: regex_matched_output = [] for s_re in extract_text: - result = re.findall(s_re.encode('utf8'), stripped_text_from_html, - flags=re.MULTILINE | re.DOTALL | re.LOCALE) - if result: - regex_matched_output = regex_matched_output + result + # incase they specified something in '/.../x' + regex = self.forward_slash_enclosed_regex_to_options(s_re) + result = re.findall(regex.encode('utf-8'), stripped_text_from_html) + for l in result: + if type(l) is tuple: + #@todo - some formatter option default (between groups) + regex_matched_output += list(l) + [b'\n'] + else: + # @todo - some formatter option default (between each ungrouped result) + regex_matched_output += [l] + [b'\n'] + + # Now we will only show what the regex matched + stripped_text_from_html = b'' + text_content_before_ignored_filter = b'' if regex_matched_output: - stripped_text_from_html = b'\n'.join(regex_matched_output) + # @todo some formatter for presentation? + stripped_text_from_html = b''.join(regex_matched_output) text_content_before_ignored_filter = stripped_text_from_html + # Re #133 - if we should strip whitespaces from triggering the change detected comparison if self.datastore.data['settings']['application'].get('ignore_whitespace', False): fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index c706b0b2..7b5d0c4a 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -239,8 +239,15 @@ Unavailable") }} {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }} diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py index d51200d3..aad29d51 100644 --- a/changedetectionio/tests/test_extract_regex.py +++ b/changedetectionio/tests/test_extract_regex.py @@ -15,7 +15,7 @@ def set_original_response(): So let's see what happens.