Merge branch 'seconds' of https://github.com/ntmmfts/changedetection.io into seconds

@@ -16,6 +16,8 @@ Open source web page monitoring, notification and change detection.
+[![Deploy to Heroku](https://www.herokucdn.com/deploy/button.svg)](https://dashboard.heroku.com/new?template=https%3A%2F%2Fgithub.com%2Fdgtlmoon%2Fchangedetection.io%2Ftree%2Fmaster)
+
+Read the [Heroku notes and limitations wiki page first](https://github.com/dgtlmoon/changedetection.io/wiki/Heroku-notes)
 
 #### Example use cases
 
 - Products and services have a change in pricing
@@ -27,6 +29,7 @@ Open source web page monitoring, notification and change detection.
 - University/organisation news from their website
 - Detect and monitor changes in JSON API responses
+- API monitoring and alerting
 - Changes in legal and other documents
 - Trigger API calls via notifications when text appears on a website
 - Glue together APIs using the JSON filter and JSON notifications
 - Create RSS feeds based on changes in web content
@@ -405,7 +405,7 @@ def changedetection_app(config=None, datastore_o=None):
 
         # Get the most recent one
         newest_history_key = datastore.get_val(uuid, 'newest_history_key')
 
-        # 0 means that theres only one, so that there should be no 'unviewed' history availabe
+        # 0 means that theres only one, so that there should be no 'unviewed' history available
        if newest_history_key == 0:
            newest_history_key = list(datastore.data['watching'][uuid]['history'].keys())[0]
@@ -418,7 +418,11 @@ def changedetection_app(config=None, datastore_o=None):
         stripped_content = handler.strip_ignore_text(raw_content,
                                                      datastore.data['watching'][uuid]['ignore_text'])
 
-        checksum = hashlib.md5(stripped_content).hexdigest()
+        if datastore.data['settings']['application'].get('ignore_whitespace', False):
+            checksum = hashlib.md5(stripped_content.translate(None, b'\r\n\t ')).hexdigest()
+        else:
+            checksum = hashlib.md5(stripped_content).hexdigest()
 
         return checksum
 
     return datastore.data['watching'][uuid]['previous_md5']
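A minimal standalone sketch of what this whitespace-insensitive checksum does (illustrative, not part of the commit; the sample strings are made up): bytes.translate() with a deletion table of None strips the listed bytes before hashing, so two snapshots that differ only in spacing hash identically.

```python
import hashlib

def checksum(content: bytes, ignore_whitespace: bool = False) -> str:
    # Deleting b'\r\n\t ' makes the hash insensitive to whitespace-only edits
    if ignore_whitespace:
        content = content.translate(None, b'\r\n\t ')
    return hashlib.md5(content).hexdigest()

a = b"Some initial text\nacross multiple lines\n"
b = b"Some initial text across  multiple lines"
assert checksum(a) != checksum(b)
assert checksum(a, ignore_whitespace=True) == checksum(b, ignore_whitespace=True)
```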
@@ -554,6 +558,8 @@ def changedetection_app(config=None, datastore_o=None):
         if request.method == 'GET':
             form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
             form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
+            form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text']
+            form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace']
             form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
             form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend']
             form.notification_title.data = datastore.data['settings']['application']['notification_title']
@@ -580,6 +586,8 @@ def changedetection_app(config=None, datastore_o=None):
             datastore.data['settings']['application']['notification_format'] = form.notification_format.data
             datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
             datastore.data['settings']['application']['base_url'] = form.base_url.data
+            datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data
+            datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data
 
             if form.trigger_check.data:
                 if len(form.notification_urls.data):
@@ -752,7 +760,8 @@ def changedetection_app(config=None, datastore_o=None):
         from pathlib import Path
 
         # Remove any existing backup file, for now we just keep one file
-        for previous_backup_filename in Path(app.config['datastore_path']).rglob('changedetection-backup-*.zip'):
+        for previous_backup_filename in Path(datastore_o.datastore_path).rglob('changedetection-backup-*.zip'):
             os.unlink(previous_backup_filename)
 
         # create a ZipFile object
@@ -760,7 +769,7 @@ def changedetection_app(config=None, datastore_o=None):
 
         # We only care about UUIDS from the current index file
         uuids = list(datastore.data['watching'].keys())
-        backup_filepath = os.path.join(app.config['datastore_path'], backupname)
+        backup_filepath = os.path.join(datastore_o.datastore_path, backupname)
 
         with zipfile.ZipFile(backup_filepath, "w",
                              compression=zipfile.ZIP_DEFLATED,
@@ -770,22 +779,22 @@ def changedetection_app(config=None, datastore_o=None):
             datastore.sync_to_json()
 
             # Add the index
-            zipObj.write(os.path.join(app.config['datastore_path'], "url-watches.json"), arcname="url-watches.json")
+            zipObj.write(os.path.join(datastore_o.datastore_path, "url-watches.json"), arcname="url-watches.json")
 
             # Add the flask app secret
-            zipObj.write(os.path.join(app.config['datastore_path'], "secret.txt"), arcname="secret.txt")
+            zipObj.write(os.path.join(datastore_o.datastore_path, "secret.txt"), arcname="secret.txt")
 
             # Add any snapshot data we find, use the full path to access the file, but make the file 'relative' in the Zip.
-            for txt_file_path in Path(app.config['datastore_path']).rglob('*.txt'):
+            for txt_file_path in Path(datastore_o.datastore_path).rglob('*.txt'):
                 parent_p = txt_file_path.parent
                 if parent_p.name in uuids:
                     zipObj.write(txt_file_path,
-                                 arcname=str(txt_file_path).replace(app.config['datastore_path'], ''),
+                                 arcname=str(txt_file_path).replace(datastore_o.datastore_path, ''),
                                  compress_type=zipfile.ZIP_DEFLATED,
                                  compresslevel=8)
 
             # Create a list file with just the URLs, so it's easier to port somewhere else in the future
-            list_file = os.path.join(app.config['datastore_path'], "url-list.txt")
+            list_file = os.path.join(datastore_o.datastore_path, "url-list.txt")
             with open(list_file, "w") as f:
                 for uuid in datastore.data['watching']:
                     url = datastore.data['watching'][uuid]['url']
@@ -797,7 +806,7 @@ def changedetection_app(config=None, datastore_o=None):
                          compress_type=zipfile.ZIP_DEFLATED,
                          compresslevel=8)
 
-        return send_from_directory(app.config['datastore_path'], backupname, as_attachment=True)
+        return send_from_directory(datastore_o.datastore_path, backupname, as_attachment=True)
 
     @app.route("/static/<string:group>/<string:filename>", methods=['GET'])
     def static_content(group, filename):
@@ -14,7 +14,8 @@ class EmptyReply(Exception):
 class Fetcher():
     error = None
     status_code = None
-    content = None # Should be bytes?
+    content = None # Should always be bytes.
+    headers = None
 
     fetcher_description ="No description"
@@ -68,9 +69,12 @@ class html_webdriver(Fetcher):
 
     # Configs for Proxy setup
     # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
-    selenium_proxy_settings_mappings = ['ftpProxy', 'httpProxy', 'noProxy',
+    selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
                                         'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
-                                        'socksProxy', 'socksUsername', 'socksPassword']
+                                        'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
+
+    proxy=None
 
     def __init__(self):
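As a rough sketch (not from the commit) of how such a mapping list can be consumed: the comment above says each setting arrives as an environment variable with a "webdriver_" prefix. The helper name and dict shape below are illustrative assumptions.

```python
import os

def selenium_proxy_args(mappings):
    # Gather webdriver_-prefixed env vars (e.g. webdriver_sslProxy) into a
    # dict keyed by the Selenium proxy setting names listed above.
    args = {}
    for setting in mappings:
        value = os.getenv('webdriver_' + setting, None)
        if value:
            args[setting] = value
    return args
```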
@@ -110,6 +114,7 @@ class html_webdriver(Fetcher):
         # @todo - dom wait loaded?
         time.sleep(5)
         self.content = driver.page_source
+        self.headers = {}
 
         driver.quit()
@@ -126,7 +131,6 @@ class html_webdriver(Fetcher):
         # driver.quit() seems to cause better exceptions
         driver.quit()
-
         return True
 
 # "html_requests" is listed as the default fetcher in store.py!
@@ -143,6 +147,8 @@ class html_requests(Fetcher):
                          timeout=timeout,
                          verify=False)
 
+        # https://stackoverflow.com/questions/44203397/python-requests-get-returns-improperly-decoded-text-instead-of-utf-8
+        # Return bytes here
         html = r.text
 
@@ -152,4 +158,5 @@ class html_requests(Fetcher):
 
         self.status_code = r.status_code
         self.content = html
+        self.headers = r.headers
 
@@ -58,8 +58,7 @@ class perform_site_check():
 
         watch = self.datastore.data['watching'][uuid]
 
-        update_obj = {'previous_md5': self.datastore.data['watching'][uuid]['previous_md5'],
-                      'history': {},
+        update_obj = {
                       "last_checked": timestamp
                       }
 
@@ -104,9 +103,16 @@ class perform_site_check():
         # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
         # return content().textfilter().jsonextract().checksumcompare() ?
 
-        is_html = True
+        is_json = fetcher.headers.get('Content-Type', '') == 'application/json'
+        is_html = not is_json
 
         css_filter_rule = watch['css_filter']
-        if css_filter_rule and len(css_filter_rule.strip()):
+        has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
+        if is_json and not has_filter_rule:
+            css_filter_rule = "json:$"
+            has_filter_rule = True
+
+        if has_filter_rule:
             if 'json:' in css_filter_rule:
                 stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
                 is_html = False
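In plain terms: when the server declares the body as application/json and the watch has no filter of its own, the commit falls back to a "json:$" filter, meaning the whole JSON document. A standalone sketch of that decision (illustrative only, with a stubbed header dict):

```python
def effective_filter(headers, css_filter_rule):
    # Server-declared JSON with no user filter falls back to the whole document
    is_json = headers.get('Content-Type', '') == 'application/json'
    has_filter_rule = bool(css_filter_rule and css_filter_rule.strip())
    if is_json and not has_filter_rule:
        return "json:$"
    return css_filter_rule if has_filter_rule else None

assert effective_filter({'Content-Type': 'application/json'}, '') == "json:$"
assert effective_filter({'Content-Type': 'text/html'}, '') is None
```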
@@ -117,28 +123,39 @@ class perform_site_check():
         if is_html:
             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
             html_content = fetcher.content
-            if css_filter_rule and len(css_filter_rule.strip()):
+            if has_filter_rule:
                 html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
 
             # get_text() via inscriptis
             stripped_text_from_html = get_text(html_content)
 
+        # Re #340 - return the content before the 'ignore text' was applied
+        text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
 
         # We rely on the actual text in the html output.. many sites have random script vars etc,
         # in the future we'll implement other mechanisms.
 
         update_obj["last_check_status"] = fetcher.get_last_status_code()
         update_obj["last_error"] = False
 
         # If there's text to skip
         # @todo we could abstract out the get_text() to handle this cleaner
-        if len(watch['ignore_text']):
-            stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html, watch['ignore_text'])
+        text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
+        if len(text_to_ignore):
+            stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html, text_to_ignore)
         else:
             stripped_text_from_html = stripped_text_from_html.encode('utf8')
 
-        fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
+        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
+        if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
+            fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
+        else:
+            fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
 
+        # On the first run of a site, watch['previous_md5'] will be an empty string, set it the current one.
+        if not len(watch['previous_md5']):
+            watch['previous_md5'] = fetched_md5
+            update_obj["previous_md5"] = fetched_md5
 
         blocked_by_not_found_trigger_text = False
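The key behavioural point above: the per-watch ignore_text list and the new global_ignore_text list are simply concatenated before strip_ignore_text() runs, so a line matching either list is dropped from the comparison. A toy illustration (the matching here is simplified to substring checks; the project's strip_ignore_text also supports /regex/ lines):

```python
def strip_ignore_text(content, ignore_lines):
    # Keep only lines that match none of the ignore rules (substring match here)
    kept = [line for line in content.splitlines()
            if not any(rule.strip() in line for rule in ignore_lines)]
    return "\n".join(kept).encode('utf8')

watch_ignore = ["Advert"]
global_ignore = ["Visitors today"]
text = "Price: 10.99\nAdvert: buy now\nVisitors today: 512"
print(strip_ignore_text(text, watch_ignore + global_ignore))  # b'Price: 10.99'
```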
@@ -160,16 +177,12 @@ class perform_site_check():
                     break
 
-        # could be None or False depending on JSON type
-        # On the first run of a site, watch['previous_md5'] will be an empty string
         if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5:
             changed_detected = True
 
-            # Don't confuse people by updating as last-changed, when it actually just changed from None..
-            if self.datastore.get_val(uuid, 'previous_md5'):
-                update_obj["last_changed"] = timestamp
-
             update_obj["previous_md5"] = fetched_md5
+            update_obj["last_changed"] = timestamp
 
         # Extract title as title
         if is_html:
@@ -178,4 +191,4 @@ class perform_site_check():
             update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
 
-        return changed_detected, update_obj, stripped_text_from_html
+        return changed_detected, update_obj, text_content_before_ignored_filter
@@ -261,3 +261,5 @@ class globalSettingsForm(commonSettingsForm):
                                           [validators.NumberRange(min=1)])
     extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
     base_url = StringField('Base URL', validators=[validators.Optional()])
+    global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
+    ignore_whitespace = BooleanField('Ignore whitespace')
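StringListField and ValidateListRegex are project helpers already used by the per-watch ignore_text field. Roughly, and this is an assumption sketched from the textarea behaviour rather than the actual implementation, the field splits the submitted textarea into a list of lines and the validator checks that any /regex/ lines compile:

```python
import re
from wtforms import TextAreaField, ValidationError

class StringListField(TextAreaField):
    # Roughly: textarea text in, list of non-empty lines out
    def process_formdata(self, valuelist):
        self.data = [l.strip() for l in valuelist[0].splitlines() if l.strip()] if valuelist else []

class ValidateListRegex(object):
    # Lines wrapped in /.../ must be valid regular expressions
    def __call__(self, form, field):
        for line in field.data:
            if line.startswith('/') and line.endswith('/'):
                try:
                    re.compile(line[1:-1])
                except re.error:
                    raise ValidationError("Invalid regex: {}".format(line))
```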
@@ -45,6 +45,8 @@ class ChangeDetectionStore:
             'base_url' : None,
             'extract_title_as_title': False,
             'fetch_backend': 'html_requests',
+            'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
+            'ignore_whitespace': False,
             'notification_urls': [], # Apprise URL list
             # Custom notification content
             'notification_title': None,
@@ -369,6 +371,10 @@ class ChangeDetectionStore:
         import uuid
 
+        output_path = "{}/{}".format(self.datastore_path, watch_uuid)
+        # Incase the operator deleted it, check and create.
+        if not os.path.isdir(output_path):
+            mkdir(output_path)
+
         fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4())
         with open(fname, 'wb') as f:
             f.write(contents)
@@ -13,6 +13,7 @@
             <li class="tab" id="default-tab"><a href="#general">General</a></li>
             <li class="tab"><a href="#notifications">Notifications</a></li>
             <li class="tab"><a href="#fetching">Fetching</a></li>
+            <li class="tab"><a href="#filters">Global Filters</a></li>
         </ul>
     </div>
     <div class="box-wrap inner">
@@ -65,6 +66,29 @@
                 </span>
             </div>
         </div>
 
+        <div class="tab-pane-inner" id="filters">
+            <fieldset class="pure-group">
+                {{ render_field(form.ignore_whitespace) }}
+                <span class="pure-form-message-inline">Ignore whitespace, tabs and new-lines/line-feeds when considering if a change was detected.<br/>
+                <i>Note:</i> Changing this will change the status of your existing watches, possibily trigger alerts etc.
+                </span>
+            </fieldset>
+
+            <fieldset class="pure-group">
+                {{ render_field(form.global_ignore_text, rows=5, placeholder="Some text to ignore in a line
+/some.regex\d{2}/ for case-INsensitive regex
+") }}
+                <span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span><br/>
+                <span class="pure-form-message-inline">Each line processed separately, any line matching will be ignored.<br/>
+                Regular Expression support, wrap the line in forward slash <b>/regex/</b>.
+                </span>
+            </fieldset>
+        </div>
 
         <div id="actions">
             <div class="pure-control-group">
                 <button type="submit" class="pure-button pure-button-primary">Save</button>
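The /regex/ convention described in the template above, sketched as a single-rule check (illustrative; the real matching lives in strip_ignore_text, and the case-insensitive flag is an assumption based on the placeholder's "case-INsensitive" note):

```python
import re

def line_is_ignored(line, rule):
    # Rules wrapped in forward slashes are treated as regular expressions
    if rule.startswith('/') and rule.endswith('/'):
        return re.search(rule[1:-1], line, re.IGNORECASE) is not None
    return rule.strip() in line

assert line_is_ignored("Session id 42", r"/some.regex\d{2}/") is False
assert line_is_ignored("some regex42", r"/some.regex\d{2}/") is True
```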
@@ -18,7 +18,8 @@ def cleanup(datastore_path):
              'url-watches.json',
              'notification.txt',
              'count.txt',
-             'endpoint-content.txt']
+             'endpoint-content.txt'
+             ]
     for file in files:
         try:
             os.unlink("{}/{}".format(datastore_path, file))
changedetectionio/tests/test_backup.py (new file)
@@ -0,0 +1,25 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from urllib.request import urlopen
+from . util import set_original_response, set_modified_response, live_server_setup
+
+
+def test_backup(client, live_server):
+
+    live_server_setup(live_server)
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    res = client.get(
+        url_for("get_backup"),
+        follow_redirects=True
+    )
+
+    # Should get the right zip content type
+    assert res.content_type == "application/zip"
+    # Should be PK/ZIP stream
+    assert res.data.count(b'PK') >= 2
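The b'PK' assertion leans on the fact that every ZIP local file header starts with the PK magic bytes. A stricter variant (illustrative only, not what the test does) would parse the response with zipfile and confirm the index file made it into the backup:

```python
import io
import zipfile

def looks_like_backup(data: bytes) -> bool:
    # Actually open the returned bytes as a ZIP archive
    with zipfile.ZipFile(io.BytesIO(data)) as z:
        return "url-watches.json" in z.namelist() and z.testzip() is None
```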
@@ -151,3 +151,88 @@ def test_check_ignore_text_functionality(client, live_server):
 
     res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
     assert b'Deleted' in res.data
+
+def test_check_global_ignore_text_functionality(client, live_server):
+    sleep_time_for_fetch_thread = 3
+
+    ignore_text = "XXXXX\r\nYYYYY\r\nZZZZZ"
+    set_original_ignore_response()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Goto the settings page, add our ignore text
+    res = client.post(
+        url_for("settings_page"),
+        data={
+            "minutes_between_check": 180,
+            "global_ignore_text": ignore_text,
+            'fetch_backend': "html_requests"
+        },
+        follow_redirects=True
+    )
+    assert b"Settings updated." in res.data
+
+    # Goto the edit page of the item, add our ignore text
+    # Add our URL to the import page
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"ignore_text": "something irrelevent but just to check", "url": test_url, 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+
+    # Check it saved
+    res = client.get(
+        url_for("settings_page"),
+    )
+    assert bytes(ignore_text.encode('utf-8')) in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # It should report nothing found (no new 'unviewed' class)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+    assert b'/test-endpoint' in res.data
+
+    # Make a change
+    set_modified_ignore_response()
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # It should report nothing found (no new 'unviewed' class)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+    assert b'/test-endpoint' in res.data
+
+    # Just to be sure.. set a regular modified change..
+    set_modified_original_ignore_response()
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+    time.sleep(sleep_time_for_fetch_thread)
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
+
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
changedetectionio/tests/test_ignorewhitespace.py (new file)
@@ -0,0 +1,96 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from . util import live_server_setup
+
+def test_setup(live_server):
+    live_server_setup(live_server)
+
+
+# Should be the same as set_original_ignore_response() but with a little more whitespacing
+def set_original_ignore_response_but_with_whitespace():
+    test_return_data = """<html>
+    <body>
+    Some initial text</br>
+    <p>
+
+
+    Which is across multiple lines</p>
+    <br>
+    </br>
+
+    So let's see what happens.  </br>
+
+
+    </body>
+    </html>
+
+    """
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+def set_original_ignore_response():
+    test_return_data = """<html>
+    <body>
+    Some initial text</br>
+    <p>Which is across multiple lines</p>
+    </br>
+    So let's see what happens.  </br>
+    </body>
+    </html>
+
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+# If there was only a change in the whitespacing, then we shouldnt have a change detected
+def test_check_ignore_whitespace(client, live_server):
+    sleep_time_for_fetch_thread = 3
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    set_original_ignore_response()
+
+    # Goto the settings page, add our ignore text
+    res = client.post(
+        url_for("settings_page"),
+        data={
+            "minutes_between_check": 180,
+            "ignore_whitespace": "y",
+            'fetch_backend': "html_requests"
+        },
+        follow_redirects=True
+    )
+    assert b"Settings updated." in res.data
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    time.sleep(sleep_time_for_fetch_thread)
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    set_original_ignore_response_but_with_whitespace()
+    time.sleep(sleep_time_for_fetch_thread)
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # It should report nothing found (no new 'unviewed' class)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+    assert b'/test-endpoint' in res.data
@@ -111,6 +111,21 @@ def set_original_response():
         f.write(test_return_data)
     return None
 
+
+def set_response_with_html():
+    test_return_data = """
+    {
+      "test": [
+        {
+          "html": "<b>"
+        }
+      ]
+    }
+    """
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+    return None
+
 def set_modified_response():
     test_return_data = """
     {
@@ -138,6 +153,37 @@ def set_modified_response():
 
     return None
 
+def test_check_json_without_filter(client, live_server):
+    # Request a JSON document from a application/json source containing HTML
+    # and be sure it doesn't get chewed up by instriptis
+    set_response_with_html()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint_json', _external=True)
+    client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(3)
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b'"<b>' in res.data
+    assert res.data.count(b'{\n') >= 2
+
+
 def test_check_json_filter(client, live_server):
     json_filter = 'json:boss.name'
@@ -159,6 +159,9 @@ def test_check_notification(client, live_server):
 
     with open("test-datastore/notification.txt", "r") as f:
         notification_submission = f.read()
+    print ("Notification submission was:", notification_submission)
+    # Re #342 - check for accidental python byte encoding of non-utf8/string
+    assert "b'" not in notification_submission
 
     assert re.search('Watch UUID: [0-9a-f]{8}(-[0-9a-f]{4}){3}-[0-9a-f]{12}', notification_submission, re.IGNORECASE)
     assert "Watch title: my title" in notification_submission
@@ -44,6 +44,16 @@ def live_server_setup(live_server):
         with open("test-datastore/endpoint-content.txt", "r") as f:
             return f.read()
 
+    @live_server.app.route('/test-endpoint-json')
+    def test_endpoint_json():
+
+        from flask import make_response
+
+        with open("test-datastore/endpoint-content.txt", "r") as f:
+            resp = make_response(f.read())
+            resp.headers['Content-Type'] = 'application/json'
+            return resp
+
     # Just return the headers in the request
     @live_server.app.route('/test-headers')
     def test_headers():
@@ -2,7 +2,12 @@ import threading
 import queue
 import time
 
-# Requests for checking on the site use a pool of thread Workers managed by a Queue.
+# A single update worker
+#
+# Requests for checking on a single site(watch) from a queue of watches
+# (another process inserts watches into the queue that are time-ready for checking)
+
 
 class update_worker(threading.Thread):
     current_uuid = None
@@ -39,6 +44,13 @@ class update_worker(threading.Thread):
                     now = time.time()
                     changed_detected, update_obj, contents = update_handler.run(uuid)
 
+                    # Re #342
+                    # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
+                    # We then convert/.decode('utf-8') for the notification etc
+                    if not isinstance(contents, (bytes, bytearray)):
+                        raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
+
                     # Always record that we atleast tried
                     self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3)})
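The reason for this guard (Re #342) is visible in one line of Python: calling str() on bytes bakes the b'...' repr into the notification text, whereas .decode('utf-8') recovers the original string. For illustration:

```python
contents = "Some initial text".encode('utf-8')

assert str(contents) == "b'Some initial text'"          # the #342 bug: the repr leaks into notifications
assert contents.decode('utf-8') == "Some initial text"  # what the worker sends instead
```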
@@ -52,74 +64,77 @@ class update_worker(threading.Thread):
                     self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
 
                 else:
-                    if update_obj:
-                        try:
-                            self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
-                            if changed_detected:
-                                n_object = {}
-                                # A change was detected
-                                fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents)
-
-                                # Update history with the stripped text for future reference, this will also mean we save the first
-                                # Should always be keyed by string(timestamp)
-                                self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}})
-
-                                watch = self.datastore.data['watching'][uuid]
-                                print (">> Change detected in UUID {} - {}".format(uuid, watch['url']))
-
-                                # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
-                                if len(watch['history']) > 1:
-
-                                    dates = list(watch['history'].keys())
-                                    # Convert to int, sort and back to str again
-                                    # @todo replace datastore getter that does this automatically
-                                    dates = [int(i) for i in dates]
-                                    dates.sort(reverse=True)
-                                    dates = [str(i) for i in dates]
-
-                                    prev_fname = watch['history'][dates[1]]
-
-                                    # Did it have any notification alerts to hit?
-                                    if len(watch['notification_urls']):
-                                        print(">>> Notifications queued for UUID from watch {}".format(uuid))
-                                        n_object['notification_urls'] = watch['notification_urls']
-                                        n_object['notification_title'] = watch['notification_title']
-                                        n_object['notification_body'] = watch['notification_body']
-                                        n_object['notification_format'] = watch['notification_format']
-
-                                    # No? maybe theres a global setting, queue them all
-                                    elif len(self.datastore.data['settings']['application']['notification_urls']):
-                                        print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(uuid))
-                                        n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
-                                        n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title']
-                                        n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body']
-                                        n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format']
-                                    else:
-                                        print(">>> NO notifications queued, watch and global notification URLs were empty.")
-
-                                    # Only prepare to notify if the rules above matched
-                                    if 'notification_urls' in n_object:
-                                        # HTML needs linebreak, but MarkDown and Text can use a linefeed
-                                        if n_object['notification_format'] == 'HTML':
-                                            line_feed_sep = "</br>"
-                                        else:
-                                            line_feed_sep = "\n"
-
-                                        from changedetectionio import diff
-                                        n_object.update({
-                                            'watch_url': watch['url'],
-                                            'uuid': uuid,
-                                            'current_snapshot': str(contents),
-                                            'diff_full': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep),
-                                            'diff': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep)
-                                        })
-
-                                        self.notification_q.put(n_object)
-
-                        except Exception as e:
-                            print("!!!! Exception in update_worker !!!\n", e)
+                    try:
+                        watch = self.datastore.data['watching'][uuid]
+
+                        # For the FIRST time we check a site, or a change detected, save the snapshot.
+                        if changed_detected or not watch['last_checked']:
+                            # A change was detected
+                            fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents)
+                            # Should always be keyed by string(timestamp)
+                            self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}})
+
+                        # Generally update anything interesting returned
+                        self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
+
+                        # A change was detected
+                        if changed_detected:
+                            n_object = {}
+                            print (">> Change detected in UUID {} - {}".format(uuid, watch['url']))
+
+                            # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
+                            if len(watch['history']) > 1:
+
+                                dates = list(watch['history'].keys())
+                                # Convert to int, sort and back to str again
+                                # @todo replace datastore getter that does this automatically
+                                dates = [int(i) for i in dates]
+                                dates.sort(reverse=True)
+                                dates = [str(i) for i in dates]
+
+                                prev_fname = watch['history'][dates[1]]
+
+                                # Did it have any notification alerts to hit?
+                                if len(watch['notification_urls']):
+                                    print(">>> Notifications queued for UUID from watch {}".format(uuid))
+                                    n_object['notification_urls'] = watch['notification_urls']
+                                    n_object['notification_title'] = watch['notification_title']
+                                    n_object['notification_body'] = watch['notification_body']
+                                    n_object['notification_format'] = watch['notification_format']
+
+                                # No? maybe theres a global setting, queue them all
+                                elif len(self.datastore.data['settings']['application']['notification_urls']):
+                                    print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(uuid))
+                                    n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
+                                    n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title']
+                                    n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body']
+                                    n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format']
+                                else:
+                                    print(">>> NO notifications queued, watch and global notification URLs were empty.")
+
+                                # Only prepare to notify if the rules above matched
+                                if 'notification_urls' in n_object:
+                                    # HTML needs linebreak, but MarkDown and Text can use a linefeed
+                                    if n_object['notification_format'] == 'HTML':
+                                        line_feed_sep = "</br>"
+                                    else:
+                                        line_feed_sep = "\n"
+
+                                    from changedetectionio import diff
+                                    n_object.update({
+                                        'watch_url': watch['url'],
+                                        'uuid': uuid,
+                                        'current_snapshot': contents.decode('utf-8'),
+                                        'diff_full': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep),
+                                        'diff': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep)
+                                    })
+
+                                    self.notification_q.put(n_object)
+
+                    except Exception as e:
+                        # Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
+                        print("!!!! Exception in update_worker !!!\n", e)
 
                 self.current_uuid = None # Done
                 self.q.task_done()
@@ -17,9 +17,9 @@ services:
       # Alternative WebDriver/selenium URL, do not use "'s or 's!
       # - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
       #
-      # WebDriver proxy settings webdriver_ftpProxy, webdriver_httpProxy, webdriver_noProxy,
+      # WebDriver proxy settings webdriver_proxyType, webdriver_ftpProxy, webdriver_httpProxy, webdriver_noProxy,
       # webdriver_proxyAutoconfigUrl, webdriver_sslProxy, webdriver_autodetect,
-      # webdriver_socksProxy, webdriver_socksUsername, webdriver_socksPassword
+      # webdriver_socksProxy, webdriver_socksUsername, webdriver_socksVersion, webdriver_socksPassword
      #
      # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
      #
@@ -29,4 +29,5 @@ cryptography ~= 3.4
 # Used for CSS filtering, replace with soupsieve and lxml for xpath
 bs4
 
-selenium ~= 3.141
+# 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0
+selenium ~= 4.1.0