Compare commits
11 Commits
bug/RSS-fe
...
scrub-sing
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d2b7958cc2 | ||
|
|
541796cd5c | ||
|
|
1ab70f8e86 | ||
|
|
8227c012a7 | ||
|
|
c113d5fb24 | ||
|
|
8c8d4066d7 | ||
|
|
277dc9e1c1 | ||
|
|
fc0fd1ce9d | ||
|
|
bd6127728a | ||
|
|
4101ae00c6 | ||
|
|
62f14df3cb |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -8,5 +8,6 @@ __pycache__
|
||||
build
|
||||
dist
|
||||
venv
|
||||
test-datastore
|
||||
*.egg-info*
|
||||
.vscode/settings.json
|
||||
|
||||
@@ -351,7 +351,9 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
latest_fname = watch.history[dates[-1]]
|
||||
|
||||
html_diff = diff.render_diff(prev_fname, latest_fname, include_equal=False, line_feed_sep="</br>")
|
||||
fe.description(description="<![CDATA[<html><body><h4>{}</h4>{}</body></html>".format(watch_title, html_diff))
|
||||
fe.description(description="<![CDATA["
|
||||
"<html><body><h4>{}</h4>{}</body></html>"
|
||||
"]]>".format(watch_title, html_diff))
|
||||
|
||||
fe.guid(guid, permalink=False)
|
||||
dt = datetime.datetime.fromtimestamp(int(watch.newest_history_key))
|
||||
@@ -456,6 +458,19 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
|
||||
return 'OK'
|
||||
|
||||
|
||||
@app.route("/scrub/<string:uuid>", methods=['GET'])
@login_required
def scrub_watch(uuid):
    """Scrub a single watch and send the user back to the index page.

    Delegates to ``datastore.scrub_watch(uuid)``. A ``KeyError`` from the
    datastore is reported to the user as a flashed error message; on success
    a confirmation naming the watch is flashed instead. Either way the
    response is a redirect to the ``index`` view.
    """
    try:
        datastore.scrub_watch(uuid)
    except KeyError:
        # Unknown uuid: tell the user and bail out early.
        flash('Watch not found', 'error')
        return redirect(url_for('index'))

    flash("Scrubbed watch {}".format(uuid))
    return redirect(url_for('index'))
|
||||
|
||||
@app.route("/scrub", methods=['GET', 'POST'])
|
||||
@login_required
|
||||
def scrub_page():
|
||||
@@ -807,7 +822,13 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
|
||||
screenshot_url = datastore.get_screenshot(uuid)
|
||||
|
||||
output = render_template("diff.html", watch_a=watch,
|
||||
system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver'
|
||||
|
||||
is_html_webdriver = True if watch.get('fetch_backend') == 'html_webdriver' or (
|
||||
watch.get('fetch_backend', None) is None and system_uses_webdriver) else False
|
||||
|
||||
output = render_template("diff.html",
|
||||
watch_a=watch,
|
||||
newest=newest_version_file_contents,
|
||||
previous=previous_version_file_contents,
|
||||
extra_stylesheets=extra_stylesheets,
|
||||
@@ -818,7 +839,8 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
current_diff_url=watch['url'],
|
||||
extra_title=" - Diff - {}".format(watch['title'] if watch['title'] else watch['url']),
|
||||
left_sticky=True,
|
||||
screenshot=screenshot_url)
|
||||
screenshot=screenshot_url,
|
||||
is_html_webdriver=is_html_webdriver)
|
||||
|
||||
return output
|
||||
|
||||
@@ -879,6 +901,11 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
content.append({'line': "No history found", 'classes': ''})
|
||||
|
||||
screenshot_url = datastore.get_screenshot(uuid)
|
||||
system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver'
|
||||
|
||||
is_html_webdriver = True if watch.get('fetch_backend') == 'html_webdriver' or (
|
||||
watch.get('fetch_backend', None) is None and system_uses_webdriver) else False
|
||||
|
||||
output = render_template("preview.html",
|
||||
content=content,
|
||||
extra_stylesheets=extra_stylesheets,
|
||||
@@ -887,8 +914,9 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
current_diff_url=watch['url'],
|
||||
screenshot=screenshot_url,
|
||||
watch=watch,
|
||||
uuid=uuid)
|
||||
|
||||
uuid=uuid,
|
||||
is_html_webdriver=is_html_webdriver)
|
||||
|
||||
return output
|
||||
|
||||
@app.route("/settings/notification-logs", methods=['GET'])
|
||||
|
||||
@@ -204,6 +204,20 @@ class perform_site_check():
|
||||
else:
|
||||
stripped_text_from_html = stripped_text_from_html.encode('utf8')
|
||||
|
||||
# 615 Extract text by regex
|
||||
extract_text = watch.get('extract_text', [])
|
||||
if len(extract_text) > 0:
|
||||
regex_matched_output = []
|
||||
for s_re in extract_text:
|
||||
result = re.findall(s_re.encode('utf8'), stripped_text_from_html,
|
||||
flags=re.MULTILINE | re.DOTALL | re.LOCALE)
|
||||
if result:
|
||||
regex_matched_output.append(result[0])
|
||||
|
||||
if regex_matched_output:
|
||||
stripped_text_from_html = b'\n'.join(regex_matched_output)
|
||||
text_content_before_ignored_filter = stripped_text_from_html
|
||||
|
||||
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
|
||||
if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
|
||||
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
|
||||
@@ -221,6 +235,7 @@ class perform_site_check():
|
||||
# Yeah, lets block first until something matches
|
||||
blocked_by_not_found_trigger_text = True
|
||||
# Filter and trigger works the same, so reuse it
|
||||
# It should return the line numbers that match
|
||||
result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
|
||||
wordlist=watch['trigger_text'],
|
||||
mode="line numbers")
|
||||
|
||||
@@ -223,7 +223,7 @@ class validateURL(object):
|
||||
except validators.ValidationFailure:
|
||||
message = field.gettext('\'%s\' is not a valid URL.' % (field.data.strip()))
|
||||
raise ValidationError(message)
|
||||
|
||||
|
||||
class ValidateListRegex(object):
|
||||
"""
|
||||
Validates that anything that looks like a regex passes as a regex
|
||||
@@ -330,6 +330,9 @@ class watchForm(commonSettingsForm):
|
||||
css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()], default='')
|
||||
|
||||
subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
|
||||
|
||||
extract_text = StringListField('Extract text', [ValidateListRegex()])
|
||||
|
||||
title = StringField('Title', default='')
|
||||
|
||||
ignore_text = StringListField('Ignore text', [ValidateListRegex()])
|
||||
|
||||
@@ -35,7 +35,8 @@ class model(dict):
|
||||
'notification_title': default_notification_title,
|
||||
'notification_body': default_notification_body,
|
||||
'notification_format': default_notification_format,
|
||||
'css_filter': "",
|
||||
'css_filter': '',
|
||||
'extract_text': [], # Extract text by regex after filters
|
||||
'subtractive_selectors': [],
|
||||
'trigger_text': [], # List of text or regex to wait for until a change is detected
|
||||
'fetch_backend': None,
|
||||
|
||||
17
changedetectionio/static/js/diff-overview.js
Normal file
17
changedetectionio/static/js/diff-overview.js
Normal file
@@ -0,0 +1,17 @@
|
||||
$(document).ready(function () {
    // Show the view matching the given URL hash.
    // The screenshot image only gets its src once the #screenshot tab is in use,
    // so we dont give a slow experience when waiting for the text diff to load.
    function toggle(hash_name) {
        if (hash_name !== '#screenshot') {
            $("#settings").show();
            return;
        }
        $("img#screenshot-img").attr('src', screenshot_url);
        $("#settings").hide();
    }

    // Re-evaluate whenever the user switches tabs (the hash changes)...
    window.addEventListener('hashchange', function (event) {
        toggle(location.hash);
    }, false);

    // ...and once up front for the hash the page was opened with.
    toggle(location.hash);
});
|
||||
@@ -49,6 +49,8 @@ $(document).ready(function() {
|
||||
}
|
||||
state_clicked=false;
|
||||
ctx.clearRect(0, 0, c.width, c.height);
|
||||
xctx.clearRect(0, 0, c.width, c.height);
|
||||
$("#css_filter").val('');
|
||||
});
|
||||
|
||||
|
||||
|
||||
@@ -254,12 +254,23 @@ class ChangeDetectionStore:
|
||||
def scrub_watch(self, uuid):
|
||||
import pathlib
|
||||
|
||||
self.__data['watching'][uuid].update({'history': {}, 'last_checked': 0, 'last_changed': 0, 'previous_md5': False})
|
||||
self.needs_write_urgent = True
|
||||
self.__data['watching'][uuid].update(
|
||||
{'last_checked': 0,
|
||||
'last_changed': 0,
|
||||
'last_viewed': 0,
|
||||
'previous_md5': False,
|
||||
'last_notification_error': False,
|
||||
'last_error': False})
|
||||
|
||||
for item in pathlib.Path(self.datastore_path).rglob(uuid+"/*.txt"):
|
||||
# JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
|
||||
for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"):
|
||||
unlink(item)
|
||||
|
||||
# Force the attr to recalculate
|
||||
bump = self.__data['watching'][uuid].history
|
||||
|
||||
self.needs_write_urgent = True
|
||||
|
||||
def add_watch(self, url, tag="", extras=None, write_to_disk_now=True):
|
||||
|
||||
if extras is None:
|
||||
@@ -287,7 +298,7 @@ class ChangeDetectionStore:
|
||||
'body', 'method',
|
||||
'ignore_text', 'css_filter',
|
||||
'subtractive_selectors', 'trigger_text',
|
||||
'extract_title_as_title']:
|
||||
'extract_title_as_title', 'extract_text']:
|
||||
if res.get(k):
|
||||
apply_extras[k] = res[k]
|
||||
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
{% extends 'base.html' %}
|
||||
|
||||
{% block content %}
|
||||
<script>
|
||||
const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}";
|
||||
</script>
|
||||
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='diff-overview.js')}}" defer></script>
|
||||
|
||||
<div id="settings">
|
||||
<h1>Differences</h1>
|
||||
<form class="pure-form " action="" method="GET">
|
||||
@@ -39,6 +44,7 @@
|
||||
<div class="tabs">
|
||||
<ul>
|
||||
<li class="tab" id="default-tab"><a href="#text">Text</a></li>
|
||||
<li class="tab" id="screenshot-tab"><a href="#screenshot">Screenshot</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
@@ -60,6 +66,21 @@
|
||||
</table>
|
||||
Diff algorithm from the amazing <a href="https://github.com/kpdecker/jsdiff">github.com/kpdecker/jsdiff</a>
|
||||
</div>
|
||||
<div class="tab-pane-inner" id="screenshot">
|
||||
<div class="tip">
|
||||
For now, Differences are performed on text, not graphically, only the latest screenshot is available.
|
||||
</div>
|
||||
</br>
|
||||
{% if is_html_webdriver %}
|
||||
{% if screenshot %}
|
||||
<img style="max-width: 80%" id="screenshot-img" alt="Current screenshot from most recent request"/>
|
||||
{% else %}
|
||||
No screenshot available just yet! Try rechecking the page.
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<strong>Screenshot requires Playwright/WebDriver enabled</strong>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@@ -199,6 +199,17 @@ nav
|
||||
</span>
|
||||
</div>
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
{{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>Extracts text in the final output after other filters using regular expressions, for example <code>\d+ online</code></li>
|
||||
<li>One line per regular-expression.</li>
|
||||
</ul>
|
||||
</span>
|
||||
</div>
|
||||
</fieldset>
|
||||
</div>
|
||||
|
||||
<div class="tab-pane-inner visual-selector-ui" id="visualselector">
|
||||
@@ -248,6 +259,8 @@ nav
|
||||
|
||||
<a href="{{url_for('form_delete', uuid=uuid)}}"
|
||||
class="pure-button button-small button-error ">Delete</a>
|
||||
<a href="{{url_for('scrub_watch', uuid=uuid)}}"
|
||||
class="pure-button button-small button-error ">Scrub</a>
|
||||
<a href="{{url_for('form_clone', uuid=uuid)}}"
|
||||
class="pure-button button-small ">Create Copy</a>
|
||||
</div>
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
{% extends 'base.html' %}
|
||||
|
||||
{% block content %}
|
||||
<script>
|
||||
const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}";
|
||||
</script>
|
||||
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='diff-overview.js')}}" defer></script>
|
||||
|
||||
<div id="settings">
|
||||
<h1>Current - {{watch.last_checked|format_timestamp_timeago}}</h1>
|
||||
@@ -10,6 +14,7 @@
|
||||
<div class="tabs">
|
||||
<ul>
|
||||
<li class="tab" id="default-tab"><a href="#text">Text</a></li>
|
||||
<li class="tab" id="screenshot-tab"><a href="#screenshot">Screenshot</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
@@ -28,5 +33,20 @@
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<div class="tab-pane-inner" id="screenshot">
|
||||
<div class="tip">
|
||||
For now, Differences are performed on text, not graphically, only the latest screenshot is available.
|
||||
</div>
|
||||
</br>
|
||||
{% if is_html_webdriver %}
|
||||
{% if screenshot %}
|
||||
<img style="max-width: 80%" id="screenshot-img" alt="Current screenshot from most recent request"/>
|
||||
{% else %}
|
||||
No screenshot available just yet! Try rechecking the page.
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<strong>Screenshot requires Playwright/WebDriver enabled</strong>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
||||
127
changedetectionio/tests/test_extract_regex.py
Normal file
127
changedetectionio/tests/test_extract_regex.py
Normal file
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from .util import live_server_setup
|
||||
|
||||
from ..html_tools import *
|
||||
|
||||
|
||||
def set_original_response():
    """Write the baseline HTML fixture to test-datastore/endpoint-content.txt.

    The file is overwritten on every call. Returns None.
    """
    baseline_html = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div id="sametext">Some text thats the same</div>
<div id="changetext">Some text that will change</div>
</body>
</html>
"""

    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(baseline_html)
|
||||
|
||||
|
||||
def set_modified_response():
    """Write the changed HTML fixture to test-datastore/endpoint-content.txt.

    Compared with the baseline fixture, the paragraph text differs and the
    #changetext element now carries "1000 online" / "80 guests". The file is
    overwritten on every call. Returns None.
    """
    changed_html = """<html>
<body>
Some initial text</br>
<p>which has this one new line</p>
</br>
So let's see what happens. </br>
<div id="sametext">Some text thats the same</div>
<div id="changetext">Some text that did change ( 1000 online <br/> 80 guests)</div>
</body>
</html>
"""

    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(changed_html)
|
||||
|
||||
|
||||
def test_check_filter_and_regex_extract(client, live_server):
    """Check that 'extract_text' regexes are applied on top of the CSS filter.

    Flow: import the test URL, run an initial check, configure the watch with
    a ``#changetext`` CSS filter and two extraction regexes (one per line),
    change the served page, recheck, and verify that the preview shows only
    the text matched by the regexes.

    Args:
        client: Flask test client fixture.
        live_server: live server fixture passed to ``live_server_setup``.
    """
    sleep_time_for_fetch_thread = 3

    live_server_setup(live_server)
    css_filter = "#changetext"

    set_original_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data

    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # Go to the edit page and set the CSS filter plus the extraction regexes.
    # The backslashes are escaped explicitly ('\\d') because '\d' is an
    # invalid escape sequence in a non-raw literal; the '\n' must stay a real
    # newline since the field takes one regular expression per line.
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter": css_filter,
              'extract_text': '\\d+ online\n\\d+ guests',
              "url": test_url,
              "tag": "",
              "headers": "",
              'fetch_backend': "html_requests"
              },
        follow_redirects=True
    )

    assert b"Updated watch." in res.data

    # Check it saved
    res = client.get(
        url_for("edit_page", uuid="first"),
    )
    assert b'\\d+ online' in res.data

    # NOTE(review): the explicit recheck was left disabled at this point in
    # the original test; presumably saving the watch queues one - confirm.
    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # Make a change
    set_modified_response()

    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # The content of the filtered #changetext element changed, so the watch
    # should be flagged as 'unviewed'
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

    # Check the preview shows the extracted text
    res = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True
    )

    # Class will be blank for now because the frontend didnt apply the diff
    assert b'<div class="">1000 online' in res.data

    # Both regexs should be here
    assert b'<div class="">80 guests' in res.data

    # Text outside the regex matches should not be here
    assert b'Some text that did change' not in res.data
|
||||
@@ -56,7 +56,10 @@ class update_worker(threading.Thread):
|
||||
except content_fetcher.ReplyWithContentButNoText as e:
|
||||
# Totally fine, it's by choice - just continue on, nothing more to care about
|
||||
# Page had elements/content but no renderable text
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found."})
|
||||
if self.datastore.data['watching'][uuid].get('css_filter'):
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (CSS / xPath Filter not found in page?)"})
|
||||
else:
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found."})
|
||||
pass
|
||||
except content_fetcher.EmptyReply as e:
|
||||
# Some kind of custom to-str handler in the exception handler that does this?
|
||||
|
||||
Reference in New Issue
Block a user