Add filter to remove elements by CSS rule from HTML before change detection is run (#445)
This commit is contained in:
@@ -35,6 +35,7 @@ from flask import (
|
|||||||
url_for,
|
url_for,
|
||||||
)
|
)
|
||||||
from flask_login import login_required
|
from flask_login import login_required
|
||||||
|
|
||||||
from changedetectionio import html_tools
|
from changedetectionio import html_tools
|
||||||
|
|
||||||
__version__ = '0.39.9'
|
__version__ = '0.39.9'
|
||||||
@@ -526,6 +527,7 @@ def changedetection_app(config=None, datastore_o=None):
|
|||||||
|
|
||||||
|
|
||||||
datastore.data['watching'][uuid]['css_filter'] = form.css_filter.data.strip()
|
datastore.data['watching'][uuid]['css_filter'] = form.css_filter.data.strip()
|
||||||
|
datastore.data['watching'][uuid]['subtractive_selectors'] = form.subtractive_selectors.data
|
||||||
|
|
||||||
# Reset the previous_md5 so we process a new snapshot including stripping ignore text.
|
# Reset the previous_md5 so we process a new snapshot including stripping ignore text.
|
||||||
if form.css_filter.data.strip() != datastore.data['watching'][uuid]['css_filter']:
|
if form.css_filter.data.strip() != datastore.data['watching'][uuid]['css_filter']:
|
||||||
@@ -598,6 +600,7 @@ def changedetection_app(config=None, datastore_o=None):
|
|||||||
if request.method == 'GET':
|
if request.method == 'GET':
|
||||||
form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
|
form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
|
||||||
form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
|
form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
|
||||||
|
form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors']
|
||||||
form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text']
|
form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text']
|
||||||
form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace']
|
form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace']
|
||||||
form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
|
form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
|
||||||
@@ -626,6 +629,7 @@ def changedetection_app(config=None, datastore_o=None):
|
|||||||
datastore.data['settings']['application']['notification_format'] = form.notification_format.data
|
datastore.data['settings']['application']['notification_format'] = form.notification_format.data
|
||||||
datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
|
datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
|
||||||
datastore.data['settings']['application']['base_url'] = form.base_url.data
|
datastore.data['settings']['application']['base_url'] = form.base_url.data
|
||||||
|
datastore.data['settings']['application']['global_subtractive_selectors'] = form.global_subtractive_selectors.data
|
||||||
datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data
|
datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data
|
||||||
datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data
|
datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data
|
||||||
|
|
||||||
|
|||||||
@@ -1,11 +1,11 @@
|
|||||||
import time
|
|
||||||
from changedetectionio import content_fetcher
|
|
||||||
from changedetectionio import html_tools
|
|
||||||
import hashlib
|
import hashlib
|
||||||
from inscriptis import get_text
|
|
||||||
import urllib3
|
|
||||||
from . import html_tools
|
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
|
import urllib3
|
||||||
|
from inscriptis import get_text
|
||||||
|
|
||||||
|
from changedetectionio import content_fetcher, html_tools
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
@@ -72,8 +72,15 @@ class perform_site_check():
|
|||||||
is_json = 'application/json' in fetcher.headers.get('Content-Type', '')
|
is_json = 'application/json' in fetcher.headers.get('Content-Type', '')
|
||||||
is_html = not is_json
|
is_html = not is_json
|
||||||
css_filter_rule = watch['css_filter']
|
css_filter_rule = watch['css_filter']
|
||||||
|
subtractive_selectors = watch.get(
|
||||||
|
"subtractive_selectors", []
|
||||||
|
) + self.datastore.data["settings"]["application"].get(
|
||||||
|
"global_subtractive_selectors", []
|
||||||
|
)
|
||||||
|
|
||||||
has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
|
has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
|
||||||
|
has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
|
||||||
|
|
||||||
if is_json and not has_filter_rule:
|
if is_json and not has_filter_rule:
|
||||||
css_filter_rule = "json:$"
|
css_filter_rule = "json:$"
|
||||||
has_filter_rule = True
|
has_filter_rule = True
|
||||||
@@ -100,11 +107,11 @@ class perform_site_check():
|
|||||||
else:
|
else:
|
||||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||||
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
||||||
|
if has_subtractive_selectors:
|
||||||
|
html_content = html_tools.element_removal(subtractive_selectors, html_content)
|
||||||
# get_text() via inscriptis
|
# get_text() via inscriptis
|
||||||
stripped_text_from_html = get_text(html_content)
|
stripped_text_from_html = get_text(html_content)
|
||||||
|
|
||||||
|
|
||||||
# Re #340 - return the content before the 'ignore text' was applied
|
# Re #340 - return the content before the 'ignore text' was applied
|
||||||
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,30 @@
|
|||||||
from wtforms import Form, SelectField, RadioField, BooleanField, StringField, PasswordField, validators, IntegerField, fields, TextAreaField, \
|
|
||||||
Field
|
|
||||||
|
|
||||||
from wtforms import widgets, SubmitField
|
|
||||||
from wtforms.validators import ValidationError
|
|
||||||
from wtforms.fields import html5
|
|
||||||
from changedetectionio import content_fetcher
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from changedetectionio.notification import default_notification_format, valid_notification_formats, default_notification_body, default_notification_title
|
from wtforms import (
|
||||||
|
BooleanField,
|
||||||
|
Field,
|
||||||
|
Form,
|
||||||
|
IntegerField,
|
||||||
|
PasswordField,
|
||||||
|
RadioField,
|
||||||
|
SelectField,
|
||||||
|
StringField,
|
||||||
|
SubmitField,
|
||||||
|
TextAreaField,
|
||||||
|
fields,
|
||||||
|
validators,
|
||||||
|
widgets,
|
||||||
|
)
|
||||||
|
from wtforms.fields import html5
|
||||||
|
from wtforms.validators import ValidationError
|
||||||
|
|
||||||
|
from changedetectionio import content_fetcher
|
||||||
|
from changedetectionio.notification import (
|
||||||
|
default_notification_body,
|
||||||
|
default_notification_format,
|
||||||
|
default_notification_title,
|
||||||
|
valid_notification_formats,
|
||||||
|
)
|
||||||
|
|
||||||
valid_method = {
|
valid_method = {
|
||||||
'GET',
|
'GET',
|
||||||
@@ -45,8 +62,8 @@ class SaltyPasswordField(StringField):
|
|||||||
encrypted_password = ""
|
encrypted_password = ""
|
||||||
|
|
||||||
def build_password(self, password):
|
def build_password(self, password):
|
||||||
import hashlib
|
|
||||||
import base64
|
import base64
|
||||||
|
import hashlib
|
||||||
import secrets
|
import secrets
|
||||||
|
|
||||||
# Make a new salt on every new password and store it with the password
|
# Make a new salt on every new password and store it with the password
|
||||||
@@ -104,9 +121,10 @@ class ValidateContentFetcherIsReady(object):
|
|||||||
self.message = message
|
self.message = message
|
||||||
|
|
||||||
def __call__(self, form, field):
|
def __call__(self, form, field):
|
||||||
from changedetectionio import content_fetcher
|
|
||||||
import urllib3.exceptions
|
import urllib3.exceptions
|
||||||
|
|
||||||
|
from changedetectionio import content_fetcher
|
||||||
|
|
||||||
# Better would be a radiohandler that keeps a reference to each class
|
# Better would be a radiohandler that keeps a reference to each class
|
||||||
if field.data is not None:
|
if field.data is not None:
|
||||||
klass = getattr(content_fetcher, field.data)
|
klass = getattr(content_fetcher, field.data)
|
||||||
@@ -213,52 +231,69 @@ class ValidateListRegex(object):
|
|||||||
except re.error:
|
except re.error:
|
||||||
message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
|
message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
|
||||||
raise ValidationError(message % (line))
|
raise ValidationError(message % (line))
|
||||||
|
|
||||||
class ValidateCSSJSONXPATHInput(object):
    """
    Filter validation
    @todo CSS validator ;)

    WTForms-style validator for filter fields. The field value may be a
    single string or a list of strings (one rule per line); every non-blank
    line is checked on its own:

    * a line starting with '/' is treated as XPath and compiled with lxml,
    * a line containing 'json:' is treated as JSONPath and parsed with
      jsonpath_ng,
    * anything else is accepted as-is (no CSS validation yet).

    ``allow_xpath`` / ``allow_json`` let a field reject those rule kinds
    outright, e.g. for fields that only make sense with CSS selectors.
    """

    def __init__(self, message=None, allow_xpath=True, allow_json=True):
        self.message = message
        self.allow_xpath = allow_xpath
        self.allow_json = allow_json

    def __call__(self, form, field):
        """Validate ``field.data``; raises ValidationError on the first bad line."""
        # Normalise to a list of lines so string and list fields share one path.
        if isinstance(field.data, str):
            data = [field.data]
        else:
            data = field.data or []

        for line in data:
            rule = line.strip()

            # Nothing to see here - but keep checking the remaining lines.
            # (Returning here would let everything after a blank line through
            # unvalidated.)
            if not rule:
                continue

            # Does it look like XPath?
            if rule[0] == '/':
                if not self.allow_xpath:
                    raise ValidationError("XPath not permitted in this field!")

                from lxml import etree, html
                tree = html.fromstring("<html></html>")

                try:
                    tree.xpath(rule)
                except etree.XPathEvalError as e:
                    message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
                    raise ValidationError(message % (line, str(e)))
                except Exception:
                    raise ValidationError("A system-error occurred when validating your XPath expression")

            if 'json:' in line:
                if not self.allow_json:
                    raise ValidationError("JSONPath not permitted in this field!")

                from jsonpath_ng.exceptions import (
                    JsonPathLexerError,
                    JsonPathParserError,
                )
                from jsonpath_ng.ext import parse

                json_path = line.replace('json:', '')

                try:
                    parse(json_path)
                except (JsonPathParserError, JsonPathLexerError) as e:
                    message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)')
                    raise ValidationError(message % (json_path, str(e)))
                except Exception:
                    raise ValidationError("A system-error occurred when validating your JSONPath expression")

            # Re #265 - maybe in the future fetch the page and offer a
            # warning/notice that its possible the rule doesnt yet match anything?
|
||||||
|
|
||||||
|
|
||||||
class quickWatchForm(Form):
|
class quickWatchForm(Form):
|
||||||
# https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5
|
# https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5
|
||||||
# `require_tld` = False is needed even for the test harness "http://localhost:5005.." to run
|
# `require_tld` = False is needed even for the test harness "http://localhost:5005.." to run
|
||||||
@@ -282,7 +317,8 @@ class watchForm(commonSettingsForm):
|
|||||||
|
|
||||||
minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck',
|
minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck',
|
||||||
[validators.Optional(), validators.NumberRange(min=1)])
|
[validators.Optional(), validators.NumberRange(min=1)])
|
||||||
css_filter = StringField('CSS/JSON/XPath Filter', [ValidateCSSJSONXPathInput()])
|
css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()])
|
||||||
|
subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
|
||||||
title = StringField('Title')
|
title = StringField('Title')
|
||||||
|
|
||||||
ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
|
ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
|
||||||
@@ -314,5 +350,6 @@ class globalSettingsForm(commonSettingsForm):
|
|||||||
[validators.NumberRange(min=1)])
|
[validators.NumberRange(min=1)])
|
||||||
extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
|
extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
|
||||||
base_url = StringField('Base URL', validators=[validators.Optional()])
|
base_url = StringField('Base URL', validators=[validators.Optional()])
|
||||||
|
global_subtractive_selectors = StringListField('Ignore elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
|
||||||
global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
|
global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
|
||||||
ignore_whitespace = BooleanField('Ignore whitespace')
|
ignore_whitespace = BooleanField('Ignore whitespace')
|
||||||
|
|||||||
@@ -1,7 +1,10 @@
|
|||||||
import json
|
import json
|
||||||
|
import re
|
||||||
|
from typing import List
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from jsonpath_ng.ext import parse
|
from jsonpath_ng.ext import parse
|
||||||
import re
|
|
||||||
|
|
||||||
class JSONNotFound(ValueError):
|
class JSONNotFound(ValueError):
|
||||||
def __init__(self, msg):
|
def __init__(self, msg):
|
||||||
@@ -16,11 +19,22 @@ def css_filter(css_filter, html_content):
|
|||||||
|
|
||||||
return html_block + "\n"
|
return html_block + "\n"
|
||||||
|
|
||||||
|
def subtractive_css_selector(css_selector, html_content):
    """Return ``html_content`` with everything matching ``css_selector`` removed.

    The markup is parsed with BeautifulSoup's "html.parser", each element
    selected by ``css_selector`` is decomposed (dropped from the tree), and
    the remaining document is serialised back to a string.
    """
    document = BeautifulSoup(html_content, "html.parser")
    matches = document.select(css_selector)
    for element in matches:
        element.decompose()
    return str(document)
|
||||||
|
|
||||||
|
|
||||||
|
def element_removal(selectors: List[str], html_content):
    """Joins individual filters into one css filter.

    Each entry of ``selectors`` is a CSS selector; they are combined into a
    single comma-separated selector list and every matching element is
    removed from ``html_content`` via ``subtractive_css_selector``.

    Blank or whitespace-only entries are dropped first, because joining them
    would build an invalid selector list such as "header,,nav". When nothing
    usable remains, ``html_content`` is returned untouched.
    """
    usable = [entry.strip() for entry in selectors if entry and entry.strip()]
    if not usable:
        # Nothing to remove - skip the parse/serialise round trip entirely.
        return html_content

    return subtractive_css_selector(",".join(usable), html_content)
|
||||||
|
|
||||||
|
|
||||||
# Return str Utf-8 of matched rules
|
# Return str Utf-8 of matched rules
|
||||||
def xpath_filter(xpath_filter, html_content):
|
def xpath_filter(xpath_filter, html_content):
|
||||||
from lxml import html
|
from lxml import etree, html
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
tree = html.fromstring(html_content)
|
tree = html.fromstring(html_content)
|
||||||
html_block = ""
|
html_block = ""
|
||||||
@@ -151,4 +165,4 @@ def strip_ignore_text(content, wordlist, mode="content"):
|
|||||||
if mode == "line numbers":
|
if mode == "line numbers":
|
||||||
return ignored_line_numbers
|
return ignored_line_numbers
|
||||||
|
|
||||||
return "\n".encode('utf8').join(output)
|
return "\n".encode('utf8').join(output)
|
||||||
|
|||||||
@@ -1,15 +1,19 @@
|
|||||||
from os import unlink, path, mkdir
|
|
||||||
import json
|
import json
|
||||||
import uuid as uuid_builder
|
|
||||||
from threading import Lock
|
|
||||||
from copy import deepcopy
|
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import time
|
|
||||||
import threading
|
|
||||||
import os
|
import os
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
import uuid as uuid_builder
|
||||||
|
from copy import deepcopy
|
||||||
|
from os import mkdir, path, unlink
|
||||||
|
from threading import Lock
|
||||||
|
|
||||||
|
from changedetectionio.notification import (
|
||||||
|
default_notification_body,
|
||||||
|
default_notification_format,
|
||||||
|
default_notification_title,
|
||||||
|
)
|
||||||
|
|
||||||
from changedetectionio.notification import default_notification_format, default_notification_body, default_notification_title
|
|
||||||
|
|
||||||
# Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods?
|
# Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods?
|
||||||
# Open a github issue if you know something :)
|
# Open a github issue if you know something :)
|
||||||
@@ -46,6 +50,7 @@ class ChangeDetectionStore:
|
|||||||
'extract_title_as_title': False,
|
'extract_title_as_title': False,
|
||||||
'fetch_backend': 'html_requests',
|
'fetch_backend': 'html_requests',
|
||||||
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
||||||
|
'global_subtractive_selectors': [],
|
||||||
'ignore_whitespace': False,
|
'ignore_whitespace': False,
|
||||||
'notification_urls': [], # Apprise URL list
|
'notification_urls': [], # Apprise URL list
|
||||||
# Custom notification content
|
# Custom notification content
|
||||||
@@ -82,6 +87,7 @@ class ChangeDetectionStore:
|
|||||||
'notification_body': default_notification_body,
|
'notification_body': default_notification_body,
|
||||||
'notification_format': default_notification_format,
|
'notification_format': default_notification_format,
|
||||||
'css_filter': "",
|
'css_filter': "",
|
||||||
|
'subtractive_selectors': [],
|
||||||
'trigger_text': [], # List of text or regex to wait for until a change is detected
|
'trigger_text': [], # List of text or regex to wait for until a change is detected
|
||||||
'fetch_backend': None,
|
'fetch_backend': None,
|
||||||
'extract_title_as_title': False
|
'extract_title_as_title': False
|
||||||
@@ -144,8 +150,8 @@ class ChangeDetectionStore:
|
|||||||
unlink(password_reset_lockfile)
|
unlink(password_reset_lockfile)
|
||||||
|
|
||||||
if not 'app_guid' in self.__data:
|
if not 'app_guid' in self.__data:
|
||||||
import sys
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
|
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
|
||||||
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
|
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
|
||||||
else:
|
else:
|
||||||
@@ -430,6 +436,7 @@ class ChangeDetectionStore:
|
|||||||
index.append(self.data['watching'][uuid]['history'][str(id)])
|
index.append(self.data['watching'][uuid]['history'][str(id)])
|
||||||
|
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
# Only in the sub-directories
|
# Only in the sub-directories
|
||||||
for item in pathlib.Path(self.datastore_path).rglob("*/*txt"):
|
for item in pathlib.Path(self.datastore_path).rglob("*/*txt"):
|
||||||
if not str(item) in index:
|
if not str(item) in index:
|
||||||
|
|||||||
@@ -122,7 +122,18 @@ User-Agent: wonderbra 1.0") }}
|
|||||||
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
|
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
|
<fieldset class="pure-group">
|
||||||
|
{{ render_field(form.subtractive_selectors, rows=5, placeholder="header
|
||||||
|
footer
|
||||||
|
nav
|
||||||
|
.stockticker") }}
|
||||||
|
<span class="pure-form-message-inline">
|
||||||
|
<ul>
|
||||||
|
<li> Remove HTML element(s) by CSS selector before text conversion. </li>
|
||||||
|
<li> Add one element or CSS selector per line to ignore multiple parts of the HTML. </li>
|
||||||
|
</ul>
|
||||||
|
</span>
|
||||||
|
</fieldset>
|
||||||
</fieldset>
|
</fieldset>
|
||||||
<fieldset class="pure-group">
|
<fieldset class="pure-group">
|
||||||
{{ render_field(form.ignore_text, rows=5, placeholder="Some text to ignore in a line
|
{{ render_field(form.ignore_text, rows=5, placeholder="Some text to ignore in a line
|
||||||
|
|||||||
@@ -83,7 +83,18 @@
|
|||||||
</span>
|
</span>
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
|
||||||
|
<fieldset class="pure-group">
|
||||||
|
{{ render_field(form.global_subtractive_selectors, rows=5, placeholder="header
|
||||||
|
footer
|
||||||
|
nav
|
||||||
|
.stockticker") }}
|
||||||
|
<span class="pure-form-message-inline">
|
||||||
|
<ul>
|
||||||
|
<li> Remove HTML element(s) by CSS selector before text conversion. </li>
|
||||||
|
<li> Add one element or CSS selector per line to ignore multiple parts of the HTML. </li>
|
||||||
|
</ul>
|
||||||
|
</span>
|
||||||
|
</fieldset>
|
||||||
<fieldset class="pure-group">
|
<fieldset class="pure-group">
|
||||||
{{ render_field(form.global_ignore_text, rows=5, placeholder="Some text to ignore in a line
|
{{ render_field(form.global_ignore_text, rows=5, placeholder="Some text to ignore in a line
|
||||||
/some.regex\d{2}/ for case-INsensitive regex
|
/some.regex\d{2}/ for case-INsensitive regex
|
||||||
|
|||||||
168
changedetectionio/tests/test_element_removal.py
Normal file
168
changedetectionio/tests/test_element_removal.py
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
from flask import url_for
|
||||||
|
|
||||||
|
from ..html_tools import *
|
||||||
|
from .util import live_server_setup
|
||||||
|
|
||||||
|
|
||||||
|
def test_setup(live_server):
    """Hand the ``live_server`` fixture to the shared ``live_server_setup`` helper."""
    live_server_setup(live_server)
|
||||||
|
|
||||||
|
|
||||||
|
def set_original_response():
    """Write the baseline HTML page to the test endpoint content file.

    The page has header, nav and footer sections plus a ``#changetext`` div
    around the main body text.
    """
    page_html = """<html>
<header>
<h2>Header</h2>
</header>
<nav>
<ul>
<li><a href="#">A</a></li>
<li><a href="#">B</a></li>
<li><a href="#">C</a></li>
</ul>
</nav>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div id="changetext">Some text that will change</div>
</body>
<footer>
<p>Footer</p>
</footer>
</html>
"""

    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(page_html)
|
||||||
|
|
||||||
|
|
||||||
|
def set_modified_response():
    """Write the modified HTML page to the test endpoint content file.

    Compared with the baseline page, only the header, the first nav link, the
    ``#changetext`` div and the footer differ.
    """
    page_html = """<html>
<header>
<h2>Header changed</h2>
</header>
<nav>
<ul>
<li><a href="#">A changed</a></li>
<li><a href="#">B</a></li>
<li><a href="#">C</a></li>
</ul>
</nav>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div id="changetext">Some text that changes</div>
</body>
<footer>
<p>Footer changed</p>
</footer>
</html>
"""

    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(page_html)
|
||||||
|
|
||||||
|
|
||||||
|
def test_element_removal_output():
    """Check that removed elements leave no trace in the rendered text."""
    from changedetectionio import fetch_site_status
    from inscriptis import get_text

    # Check text with sub-parts renders correctly
    markup = """<html>
<header>
<h2>Header</h2>
</header>
<nav>
<ul>
<li><a href="#">A</a></li>
</ul>
</nav>
<body>
Some initial text</br>
<p>across multiple lines</p>
<div id="changetext">Some text that changes</div>
</body>
<footer>
<p>Footer</p>
</footer>
</html>
"""
    unwanted = ["header", "footer", "nav", "#changetext"]
    pruned_html = element_removal(unwanted, html_content=markup)
    rendered = get_text(pruned_html)

    # Only the body text outside the removed elements should survive.
    expected = """Some initial text

across multiple lines
"""
    assert rendered == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_element_removal_full(client, live_server):
    """End-to-end check: changes inside removed elements must not flag the watch."""
    sleep_time_for_fetch_thread = 3

    set_original_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Add our URL to the import page
    test_url = url_for("test_endpoint", _external=True)
    import_response = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True,
    )
    assert b"1 Imported" in import_response.data

    # Goto the edit page, add the filter data
    # Not sure why \r needs to be added - absent of the #changetext this is not necessary
    subtractive_selectors_data = "header\r\nfooter\r\nnav\r\n#changetext"
    edit_form = {
        "subtractive_selectors": subtractive_selectors_data,
        "url": test_url,
        "tag": "",
        "headers": "",
        "fetch_backend": "html_requests",
    }
    edit_response = client.post(
        url_for("edit_page", uuid="first"),
        data=edit_form,
        follow_redirects=True,
    )
    assert b"Updated watch." in edit_response.data

    # Check it saved
    saved_page = client.get(url_for("edit_page", uuid="first"))
    assert subtractive_selectors_data.encode("utf-8") in saved_page.data

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # No change yet - first check
    index_page = client.get(url_for("index"))
    assert b"unviewed" not in index_page.data

    # Make a change to header/footer/nav
    set_modified_response()

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # There should not be an unviewed change, as changes should be removed
    index_page = client.get(url_for("index"))
    assert b"unviewed" not in index_page.data
|
||||||
Reference in New Issue
Block a user