Adding support for change detection of HTML source-code via "source:https://website.com" prefix (#540)
This commit is contained in:
@@ -20,7 +20,7 @@ class perform_site_check():
|
||||
timestamp = int(time.time()) # used for storage etc too
|
||||
|
||||
changed_detected = False
|
||||
screenshot = False # as bytes
|
||||
screenshot = False # as bytes
|
||||
stripped_text_from_html = ""
|
||||
|
||||
watch = self.datastore.data['watching'][uuid]
|
||||
@@ -52,6 +52,12 @@ class perform_site_check():
|
||||
request_method = self.datastore.get_val(uuid, 'method')
|
||||
ignore_status_code = self.datastore.get_val(uuid, 'ignore_status_codes')
|
||||
|
||||
# source: support
|
||||
is_source = False
|
||||
if url.startswith('source:'):
|
||||
url = url.replace('source:', '')
|
||||
is_source = True
|
||||
|
||||
# Pluggable content fetcher
|
||||
prefer_backend = watch['fetch_backend']
|
||||
if hasattr(content_fetcher, prefer_backend):
|
||||
@@ -60,7 +66,6 @@ class perform_site_check():
|
||||
# If the klass doesn't exist, just use a default
|
||||
klass = getattr(content_fetcher, "html_requests")
|
||||
|
||||
|
||||
fetcher = klass()
|
||||
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code)
|
||||
# Fetching complete, now filters
|
||||
@@ -75,6 +80,12 @@ class perform_site_check():
|
||||
|
||||
is_json = 'application/json' in fetcher.headers.get('Content-Type', '')
|
||||
is_html = not is_json
|
||||
|
||||
# source: support, basically treat it as plaintext
|
||||
if is_source:
|
||||
is_html = False
|
||||
is_json = False
|
||||
|
||||
css_filter_rule = watch['css_filter']
|
||||
subtractive_selectors = watch.get(
|
||||
"subtractive_selectors", []
|
||||
@@ -94,7 +105,7 @@ class perform_site_check():
|
||||
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
|
||||
is_html = False
|
||||
|
||||
if is_html:
|
||||
if is_html or is_source:
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
html_content = fetcher.content
|
||||
|
||||
@@ -113,15 +124,24 @@ class perform_site_check():
|
||||
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
||||
if has_subtractive_selectors:
|
||||
html_content = html_tools.element_removal(subtractive_selectors, html_content)
|
||||
# extract text
|
||||
stripped_text_from_html = \
|
||||
html_tools.html_to_text(
|
||||
html_content,
|
||||
render_anchor_tag_content=self.datastore.data["settings"][
|
||||
"application"].get(
|
||||
"render_anchor_tag_content", False)
|
||||
)
|
||||
|
||||
|
||||
if not is_source:
|
||||
# extract text
|
||||
stripped_text_from_html = \
|
||||
html_tools.html_to_text(
|
||||
html_content,
|
||||
render_anchor_tag_content=self.datastore.data["settings"][
|
||||
"application"].get(
|
||||
"render_anchor_tag_content", False)
|
||||
)
|
||||
|
||||
elif is_source:
|
||||
stripped_text_from_html = html_content
|
||||
|
||||
# Re #340 - return the content before the 'ignore text' was applied
|
||||
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
||||
|
||||
|
||||
# Re #340 - return the content before the 'ignore text' was applied
|
||||
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
||||
|
||||
@@ -161,13 +181,11 @@ class perform_site_check():
|
||||
if result:
|
||||
blocked_by_not_found_trigger_text = False
|
||||
|
||||
|
||||
if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5:
|
||||
changed_detected = True
|
||||
update_obj["previous_md5"] = fetched_md5
|
||||
update_obj["last_changed"] = timestamp
|
||||
|
||||
|
||||
# Extract title as title
|
||||
if is_html:
|
||||
if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']:
|
||||
@@ -179,4 +197,4 @@ class perform_site_check():
|
||||
|
||||
fetcher.quit()
|
||||
|
||||
return changed_detected, update_obj, text_content_before_ignored_filter, screenshot
|
||||
return changed_detected, update_obj, text_content_before_ignored_filter, screenshot
|
||||
Reference in New Issue
Block a user