New feature - "Extract text" filter ability (#624)
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -8,5 +8,6 @@ __pycache__
|
|||||||
build
|
build
|
||||||
dist
|
dist
|
||||||
venv
|
venv
|
||||||
|
test-datastore
|
||||||
*.egg-info*
|
*.egg-info*
|
||||||
.vscode/settings.json
|
.vscode/settings.json
|
||||||
|
|||||||
@@ -204,6 +204,20 @@ class perform_site_check():
|
|||||||
else:
|
else:
|
||||||
stripped_text_from_html = stripped_text_from_html.encode('utf8')
|
stripped_text_from_html = stripped_text_from_html.encode('utf8')
|
||||||
|
|
||||||
|
# 615 Extract text by regex
|
||||||
|
extract_text = watch.get('extract_text', [])
|
||||||
|
if len(extract_text) > 0:
|
||||||
|
regex_matched_output = []
|
||||||
|
for s_re in extract_text:
|
||||||
|
result = re.findall(s_re.encode('utf8'), stripped_text_from_html,
|
||||||
|
flags=re.MULTILINE | re.DOTALL | re.LOCALE)
|
||||||
|
if result:
|
||||||
|
regex_matched_output.append(result[0])
|
||||||
|
|
||||||
|
if regex_matched_output:
|
||||||
|
stripped_text_from_html = b'\n'.join(regex_matched_output)
|
||||||
|
text_content_before_ignored_filter = stripped_text_from_html
|
||||||
|
|
||||||
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
|
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
|
||||||
if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
|
if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
|
||||||
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
|
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
|
||||||
@@ -221,6 +235,7 @@ class perform_site_check():
|
|||||||
# Yeah, lets block first until something matches
|
# Yeah, lets block first until something matches
|
||||||
blocked_by_not_found_trigger_text = True
|
blocked_by_not_found_trigger_text = True
|
||||||
# Filter and trigger works the same, so reuse it
|
# Filter and trigger works the same, so reuse it
|
||||||
|
# It should return the line numbers that match
|
||||||
result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
|
result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
|
||||||
wordlist=watch['trigger_text'],
|
wordlist=watch['trigger_text'],
|
||||||
mode="line numbers")
|
mode="line numbers")
|
||||||
|
|||||||
@@ -223,7 +223,7 @@ class validateURL(object):
|
|||||||
except validators.ValidationFailure:
|
except validators.ValidationFailure:
|
||||||
message = field.gettext('\'%s\' is not a valid URL.' % (field.data.strip()))
|
message = field.gettext('\'%s\' is not a valid URL.' % (field.data.strip()))
|
||||||
raise ValidationError(message)
|
raise ValidationError(message)
|
||||||
|
|
||||||
class ValidateListRegex(object):
|
class ValidateListRegex(object):
|
||||||
"""
|
"""
|
||||||
Validates that anything that looks like a regex passes as a regex
|
Validates that anything that looks like a regex passes as a regex
|
||||||
@@ -330,6 +330,9 @@ class watchForm(commonSettingsForm):
|
|||||||
css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()], default='')
|
css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()], default='')
|
||||||
|
|
||||||
subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
|
subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
|
||||||
|
|
||||||
|
extract_text = StringListField('Extract text', [ValidateListRegex()])
|
||||||
|
|
||||||
title = StringField('Title', default='')
|
title = StringField('Title', default='')
|
||||||
|
|
||||||
ignore_text = StringListField('Ignore text', [ValidateListRegex()])
|
ignore_text = StringListField('Ignore text', [ValidateListRegex()])
|
||||||
|
|||||||
@@ -35,7 +35,8 @@ class model(dict):
|
|||||||
'notification_title': default_notification_title,
|
'notification_title': default_notification_title,
|
||||||
'notification_body': default_notification_body,
|
'notification_body': default_notification_body,
|
||||||
'notification_format': default_notification_format,
|
'notification_format': default_notification_format,
|
||||||
'css_filter': "",
|
'css_filter': '',
|
||||||
|
'extract_text': [], # Extract text by regex after filters
|
||||||
'subtractive_selectors': [],
|
'subtractive_selectors': [],
|
||||||
'trigger_text': [], # List of text or regex to wait for until a change is detected
|
'trigger_text': [], # List of text or regex to wait for until a change is detected
|
||||||
'fetch_backend': None,
|
'fetch_backend': None,
|
||||||
|
|||||||
@@ -199,6 +199,17 @@ nav
|
|||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
<fieldset>
|
||||||
|
<div class="pure-control-group">
|
||||||
|
{{ render_field(form.extract_text, rows=5, placeholder="/some.regex\d{2}/ case-insensitive regex") }}
|
||||||
|
<span class="pure-form-message-inline">
|
||||||
|
<ul>
|
||||||
|
<li>Extracts text in the final output after other filters using regular expressions, for example <code>\d+ online</code></li>
|
||||||
|
<li>One line per regular-expression.</li>
|
||||||
|
</ul>
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
</fieldset>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="tab-pane-inner visual-selector-ui" id="visualselector">
|
<div class="tab-pane-inner visual-selector-ui" id="visualselector">
|
||||||
|
|||||||
127
changedetectionio/tests/test_extract_regex.py
Normal file
127
changedetectionio/tests/test_extract_regex.py
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import time
|
||||||
|
from flask import url_for
|
||||||
|
from .util import live_server_setup
|
||||||
|
|
||||||
|
from ..html_tools import *
|
||||||
|
|
||||||
|
|
||||||
|
def set_original_response():
    """Write the baseline HTML page to test-datastore/endpoint-content.txt.

    The ``#changetext`` div holds its initial wording here; the modified
    variant of this page rewrites that div.
    """
    baseline_html = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div id="sametext">Some text thats the same</div>
<div id="changetext">Some text that will change</div>
</body>
</html>
"""

    # Overwrite whatever page content was written previously.
    with open("test-datastore/endpoint-content.txt", "w") as handle:
        handle.write(baseline_html)
|
||||||
|
|
||||||
|
|
||||||
|
def set_modified_response():
    """Write the changed HTML page to test-datastore/endpoint-content.txt.

    Compared with the baseline page, the paragraph text differs and the
    ``#changetext`` div now contains "1000 online" and "80 guests", which
    are the fragments the extract-text regexes in the test look for.
    """
    changed_html = """<html>
<body>
Some initial text</br>
<p>which has this one new line</p>
</br>
So let's see what happens. </br>
<div id="sametext">Some text thats the same</div>
<div id="changetext">Some text that did change ( 1000 online <br/> 80 guests)</div>
</body>
</html>
"""

    # Overwrite whatever page content was written previously.
    with open("test-datastore/endpoint-content.txt", "w") as handle:
        handle.write(changed_html)
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_filter_and_regex_extract(client, live_server):
    """Check that 'extract_text' regexes reduce the watched text to their matches.

    Flow: import the test URL, run a first check against the baseline page,
    configure a ``#changetext`` CSS filter plus two extract-text regexes,
    switch the endpoint to the modified page, re-check, and then verify the
    preview only contains the regex matches.
    """
    sleep_time_for_fetch_thread = 3

    live_server_setup(live_server)
    css_filter = "#changetext"
    # One regex per line in the form field. Raw strings keep the backslashes
    # literal without relying on Python passing through unknown escapes.
    extract_text_rules = [r'\d+ online', r'\d+ guests']

    set_original_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data

    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # Go to the edit page and set the CSS filter and the extract-text regexes
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter": css_filter,
              'extract_text': "\n".join(extract_text_rules),
              "url": test_url,
              "tag": "",
              "headers": "",
              'fetch_backend': "html_requests"
              },
        follow_redirects=True
    )

    assert b"Updated watch." in res.data

    # Check it saved
    res = client.get(
        url_for("edit_page", uuid="first"),
    )
    assert rb'\d+ online' in res.data

    # No explicit re-check is requested here, only a pause.
    # NOTE(review): presumably saving the edit form queues a re-check on its
    # own - confirm against the edit_page handler.
    time.sleep(sleep_time_for_fetch_thread)

    # Make a change
    set_modified_response()

    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # The filtered '#changetext' element changed, so the watch should be
    # flagged as 'unviewed' on the index page
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

    # Check HTML conversion detected and worked
    res = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True
    )

    # Class will be blank for now because the frontend didnt apply the diff
    assert b'<div class="">1000 online' in res.data

    # Both regexs should be here
    assert b'<div class="">80 guests' in res.data

    # Text outside the regex matches should not be here
    assert b'Some text that did change' not in res.data
|
||||||
Reference in New Issue
Block a user