Compare commits

...

20 Commits

Author SHA1 Message Date
dgtlmoon
60cefca3d7 Not needed 2022-07-22 18:58:42 +02:00
dgtlmoon
f6d64445bf Small cleanups 2022-07-22 18:55:18 +02:00
dgtlmoon
872bbba71c Notifications - email - Correctly send plaintext notification email with plaintext header (#767) 2022-07-21 15:22:20 +02:00
Jonathon Sisson
d578de1a35 Form text tweak - Regex clarification (#766) 2022-07-21 10:05:59 +02:00
dgtlmoon
cdc104be10 Update README.md 2022-07-20 14:37:45 +02:00
dgtlmoon
dd0eeca056 Handle simple obfuscations - HomeDepot.com style price obfuscation (#764) 2022-07-20 14:02:22 +02:00
dgtlmoon
a95468be08 Fixing docker-compose.yml PLAYWRIGHT_DRIVER_URL example URL 2022-07-15 20:45:29 +02:00
Brandon Wees
ace44d0e00 Notifications fix - Discord - added discord webhook base url to truncation rules (#753)
Co-authored-by: bwees <branonwees@gmail.com>
2022-07-14 17:41:12 +02:00
dgtlmoon
ebb8b88621 Update Playwright URI Env example with stealth setting and CORS workaround (more reliable fetching) 2022-07-12 22:36:22 +02:00
dgtlmoon
12fc2200de remove extra file 2022-07-12 22:32:20 +02:00
dgtlmoon
52d3d375ba removing package-lock.json - not required to be in git 2022-07-10 20:29:11 +02:00
dgtlmoon
08117089e6 Share-icon cleanups 2022-07-10 20:24:49 +02:00
dgtlmoon
2ba3a6d53f Test improvement: Extract text should return all matches 2022-07-10 20:05:48 +02:00
dgtlmoon
2f636553a9 Bug fix: RSS Feed should also announce utf-8 charset 2022-07-10 18:50:21 +02:00
Freddie Leeman
0bde48b282 Regex extract filter: Return all regex results instead of first match (#730) 2022-07-10 15:09:10 +02:00
dgtlmoon
fae1164c0b Ability to specify JS before running change-detection (#744) 2022-07-10 13:56:01 +02:00
dgtlmoon
169c293143 Playwright - log console errors to output 2022-07-10 13:55:29 +02:00
dgtlmoon
46cb5cff66 UI Improvement - Clarifying "Visual Filter" tool as "Visual Selector Filter" 2022-07-10 12:51:12 +02:00
Simo Elalj
05584ea886 Use environment variables to override new watch settings defaults (user-agent, timeout, workers) (#742) 2022-07-08 20:50:04 +02:00
marvin8
176a591357 Update docker-compose.yml - Remove duplicate environment variables from playwright-chrome sample config in docker-compose.yml (#738) 2022-07-06 09:03:10 +02:00
19 changed files with 146 additions and 3759 deletions

View File

@@ -3,9 +3,9 @@
![changedetection.io](https://github.com/dgtlmoon/changedetection.io/actions/workflows/test-only.yml/badge.svg?branch=master)
## Self-Hosted, Open Source, Change Monitoring of Web Pages
## Web Site Change Detection, Monitoring and Notification - Self-Hosted or SaaS.
_Know when web pages change! Stay ontop of new information!_
_Know when web pages change! Stay on top of new information! Get notifications when important website content changes_
Live your data-life *pro-actively* instead of *re-actively*.

View File

@@ -361,7 +361,7 @@ def changedetection_app(config=None, datastore_o=None):
fe.pubDate(dt)
response = make_response(fg.rss_str())
response.headers.set('Content-Type', 'application/rss+xml')
response.headers.set('Content-Type', 'application/rss+xml;charset=utf-8')
return response
@app.route("/", methods=['GET'])

View File

@@ -46,6 +46,7 @@ class Fetcher():
headers = None
fetcher_description = "No description"
webdriver_js_execute_code = None
xpath_element_js = """
// Include the getXpath script directly, easier than fetching
!function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}});
@@ -175,7 +176,6 @@ class Fetcher():
# Will be needed in the future by the VisualSelector, always get this where possible.
screenshot = False
fetcher_description = "No description"
system_http_proxy = os.getenv('HTTP_PROXY')
system_https_proxy = os.getenv('HTTPS_PROXY')
@@ -309,13 +309,19 @@ class base_html_playwright(Fetcher):
page.set_default_navigation_timeout(90000)
page.set_default_timeout(90000)
# Bug - never set viewport size BEFORE page.goto
# Listen for all console events and handle errors
page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
# Bug - never set viewport size BEFORE page.goto
# Waits for the next navigation. Using Python context manager
# prevents a race condition between clicking and waiting for a navigation.
with page.expect_navigation():
response = page.goto(url, wait_until='load')
if self.webdriver_js_execute_code is not None:
page.evaluate(self.webdriver_js_execute_code)
except playwright._impl._api_types.TimeoutError as e:
context.close()
browser.close()
@@ -447,6 +453,12 @@ class base_html_webdriver(Fetcher):
self.driver.set_window_size(1280, 1024)
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
self.driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as well as Playwright does, so wait again
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
self.screenshot = self.driver.get_screenshot_as_png()
# @todo - how to check this? is it possible?

View File

@@ -106,6 +106,9 @@ class perform_site_check():
elif system_webdriver_delay is not None:
fetcher.render_extract_delay = system_webdriver_delay
if watch['webdriver_js_execute_code'] is not None and watch['webdriver_js_execute_code'].strip():
fetcher.webdriver_js_execute_code = watch['webdriver_js_execute_code']
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code, watch['css_filter'])
fetcher.quit()
@@ -147,7 +150,9 @@ class perform_site_check():
is_html = False
if is_html or is_source:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
fetcher.content = html_tools.workarounds_for_obfuscations(fetcher.content)
html_content = fetcher.content
# If not JSON, and if it's not text/plain..
@@ -213,7 +218,7 @@ class perform_site_check():
result = re.findall(s_re.encode('utf8'), stripped_text_from_html,
flags=re.MULTILINE | re.DOTALL | re.LOCALE)
if result:
regex_matched_output.append(result[0])
regex_matched_output = regex_matched_output + result
if regex_matched_output:
stripped_text_from_html = b'\n'.join(regex_matched_output)

View File

@@ -344,6 +344,8 @@ class watchForm(commonSettingsForm):
trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])
text_should_not_be_present = StringListField('Block change-detection if text matches', [validators.Optional(), ValidateListRegex()])
webdriver_js_execute_code = TextAreaField('Execute JavaScript before change detection', render_kw={"rows": "5"}, validators=[validators.Optional()])
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": "pure-button pure-button-primary"})
proxy = RadioField('Proxy')

View File

@@ -202,3 +202,17 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
return text_content
def workarounds_for_obfuscations(content):
    """
    Strip simple markup obfuscations before the HTML is converted to text.

    Some sites are using sneaky tactics to make prices and other information
    un-renderable by Inscriptis, e.g. HomeDepot.com style markup:
        <span>$<!-- -->90<!-- -->.<!-- -->74</span>
    See https://github.com/weblyzard/inscriptis/issues/45

    This could go into its own Pip package in the future, for faster updates.

    :param content: HTML markup as a str. Falsy values (None, '') are returned unchanged.
    :return: content with every whitespace-only HTML comment removed.
    """
    if not content:
        return content

    # Raw string on purpose: '\s' inside a plain string literal is an invalid
    # escape sequence (DeprecationWarning, SyntaxWarning from Python 3.12).
    # Only comments containing nothing but whitespace are removed; comments
    # with real content are left untouched.
    return re.sub(r'<!--\s+-->', '', content)

View File

@@ -1,8 +1,4 @@
import collections
import os
import uuid as uuid_builder
from os import getenv
from changedetectionio.notification import (
default_notification_body,
default_notification_format,
@@ -15,16 +11,16 @@ class model(dict):
'watching': {},
'settings': {
'headers': {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
'User-Agent': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate', # No support for brolti in python requests yet.
'Accept-Language': 'en-GB,en-US;q=0.9,en;'
},
'requests': {
'timeout': 15, # Default 15 seconds
'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds
'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
'jitter_seconds': 0,
'workers': 10, # Number of threads, lower is better for slow connections
'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")), # Number of threads, lower is better for slow connections
'proxy': None # Preferred proxy connection
},
'application': {
@@ -33,7 +29,7 @@ class model(dict):
'base_url' : None,
'extract_title_as_title': False,
'empty_pages_are_a_change': False,
'fetch_backend': os.getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
'global_subtractive_selectors': [],
'ignore_whitespace': True,

View File

@@ -2,6 +2,7 @@ import os
import uuid as uuid_builder
minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 60))
mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
from changedetectionio.notification import (
default_notification_body,
@@ -47,10 +48,11 @@ class model(dict):
# Requires setting to None on submit if it's the same as the default
# Should be all None by default, so we use the system default in this case.
'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
'webdriver_delay': None
'webdriver_delay': None,
'webdriver_js_execute_code': None, # Run before change-detection
}
jitter_seconds = 0
mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
def __init__(self, *arg, **kw):
import uuid
self.update(self.__base_config)
@@ -159,7 +161,7 @@ class model(dict):
def threshold_seconds(self):
seconds = 0
for m, n in self.mtable.items():
for m, n in mtable.items():
x = self.get('time_between_check', {}).get(m, None)
if x:
seconds += x * n

View File

@@ -64,7 +64,7 @@ def process_notification(n_object, datastore):
# So if no avatar_url is specified, add one so it can be correctly calculated into the total payload
k = '?' if not '?' in url else '&'
if not 'avatar_url' in url:
if not 'avatar_url' in url and not url.startswith('mail'):
url += k + 'avatar_url=https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/changedetectionio/static/images/avatar-256x256.png'
if url.startswith('tgram://'):
@@ -79,13 +79,20 @@ def process_notification(n_object, datastore):
n_title = n_title[0:payload_max_size]
n_body = n_body[0:body_limit]
elif url.startswith('discord://'):
elif url.startswith('discord://') or url.startswith('https://discordapp.com/api/webhooks'):
# real limit is 2000, but minus some for extra metadata
payload_max_size = 1700
body_limit = max(0, payload_max_size - len(n_title))
n_title = n_title[0:payload_max_size]
n_body = n_body[0:body_limit]
elif url.startswith('mailto'):
# Apprise will default to HTML, so we need to override it
# So that what's generated in n_body is in line with what is going to be sent.
# https://github.com/caronc/apprise/issues/633#issuecomment-1191449321
if not 'format=' in url and (n_format == 'text' or n_format == 'markdown'):
url = "{}?format={}".format(url, n_format)
apobj.add(url)
apobj.notify(

View File

@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
width="18"
height="19.92"
viewBox="0 0 18 19.92"
version="1.1"
id="svg6"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg">
<defs
id="defs10" />
<path
d="M -3,-2 H 21 V 22 H -3 Z"
fill="none"
id="path2" />
<path
d="m 15,14.08 c -0.76,0 -1.44,0.3 -1.96,0.77 L 5.91,10.7 C 5.96,10.47 6,10.24 6,10 6,9.76 5.96,9.53 5.91,9.3 L 12.96,5.19 C 13.5,5.69 14.21,6 15,6 16.66,6 18,4.66 18,3 18,1.34 16.66,0 15,0 c -1.66,0 -3,1.34 -3,3 0,0.24 0.04,0.47 0.09,0.7 L 5.04,7.81 C 4.5,7.31 3.79,7 3,7 1.34,7 0,8.34 0,10 c 0,1.66 1.34,3 3,3 0.79,0 1.5,-0.31 2.04,-0.81 l 7.12,4.16 c -0.05,0.21 -0.08,0.43 -0.08,0.65 0,1.61 1.31,2.92 2.92,2.92 1.61,0 2.92,-1.31 2.92,-2.92 0,-1.61 -1.31,-2.92 -2.92,-2.92 z"
id="path4"
style="fill:#ffffff;fill-opacity:1" />
</svg>

After

Width:  |  Height:  |  Size: 892 B

View File

@@ -1 +1,3 @@
node_modules
package-lock.json

File diff suppressed because it is too large Load Diff

View File

@@ -158,8 +158,7 @@ class ChangeDetectionStore:
@property
def threshold_seconds(self):
seconds = 0
mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
for m, n in mtable.items():
for m, n in Watch.mtable.items():
x = self.__data['settings']['requests']['time_between_check'].get(m)
if x:
seconds += x * n
@@ -298,7 +297,8 @@ class ChangeDetectionStore:
'ignore_text', 'css_filter',
'subtractive_selectors', 'trigger_text',
'extract_title_as_title', 'extract_text',
'text_should_not_be_present']:
'text_should_not_be_present',
'webdriver_js_execute_code']:
if res.get(k):
apply_extras[k] = res[k]

View File

@@ -25,7 +25,7 @@
<ul>
<li class="tab" id="default-tab"><a href="#general">General</a></li>
<li class="tab"><a href="#request">Request</a></li>
<li class="tab"><a id="visualselector-tab" href="#visualselector">Visual Selector</a></li>
<li class="tab"><a id="visualselector-tab" href="#visualselector">Visual Filter Selector</a></li>
<li class="tab"><a href="#filters-and-triggers">Filters &amp; Triggers</a></li>
<li class="tab"><a href="#notifications">Notifications</a></li>
</ul>
@@ -88,14 +88,17 @@
<strong>If you're having trouble waiting for the page to be fully rendered (text missing etc), try increasing the 'wait' time here.</strong>
<br/>
This will wait <i>n</i> seconds before extracting the text.
{% if using_global_webdriver_wait %}
<br/><strong>Using the current global default settings</strong>
{% endif %}
</div>
</div>
{% if using_global_webdriver_wait %}
<div class="pure-form-message-inline">
<strong>Using the current global default settings</strong>
<div class="pure-control-group">
{{ render_field(form.webdriver_js_execute_code) }}
<div class="pure-form-message-inline">
Run this code before performing change detection, handy for filling in fields and other actions <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Run-JavaScript-before-change-detection">More help and examples here</a>
</div>
</div>
{% endif %}
</fieldset>
<fieldset class="pure-group" id="requests-override-options">
{% if not playwright_enabled %}
@@ -187,7 +190,7 @@ nav
<span class="pure-form-message-inline">
<ul>
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
<li>Regular Expression support, wrap the line in forward slash <code>/regex/</code></li>
<li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li>
<li>Changing this will affect the comparison checksum which may trigger an alert</li>
<li>Use the preview/show current tab to see ignores</li>
</ul>
@@ -240,7 +243,7 @@ Unavailable") }}
<div class="tab-pane-inner visual-selector-ui" id="visualselector">
<img id="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}">
<strong>Pro-tip:</strong> This tool is only for limiting which elements will be included on a change-detection, not for interacting with browser directly.
<fieldset>
<div class="pure-control-group">
{% if visualselector_enabled %}

View File

@@ -148,7 +148,7 @@ nav
<ul>
<li>Note: This is applied globally in addition to the per-watch rules.</li>
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
<li>Regular Expression support, wrap the line in forward slash <code>/regex/</code></li>
<li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li>
<li>Changing this will affect the comparison checksum which may trigger an alert</li>
<li>Use the preview/show current tab to see ignores</li>
</ul>

View File

@@ -14,7 +14,7 @@
{{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }}
<button type="submit" class="pure-button pure-button-primary">Watch</button>
</fieldset>
<span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></a></span>
<span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread-white.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></a></span>
</form>
<div>
<a href="{{url_for('index')}}" class="pure-button button-tag {{'active' if not active_tag }}">All</a>

View File

@@ -33,7 +33,7 @@ def set_modified_response():
</br>
So let's see what happens. </br>
<div id="sametext">Some text thats the same</div>
<div id="changetext">Some text that did change ( 1000 online <br/> 80 guests)</div>
<div id="changetext">Some text that did change ( 1000 online <br/> 80 guests<br/> 2000 online )</div>
</body>
</html>
"""
@@ -119,9 +119,12 @@ def test_check_filter_and_regex_extract(client, live_server):
# Class will be blank for now because the frontend didnt apply the diff
assert b'<div class="">1000 online' in res.data
# All regex matching should be here
assert b'<div class="">2000 online' in res.data
# Both regexs should be here
assert b'<div class="">80 guests' in res.data
# Should not be here
assert b'Some text that did change' not in res.data
assert b'Some text that did change' not in res.data

View File

@@ -0,0 +1,43 @@
#!/usr/bin/python3
import time
from flask import url_for
from .util import live_server_setup
def set_original_ignore_response():
    """Write the obfuscated-price HTML page to test-datastore/endpoint-content.txt."""
    # The price digits are split up by whitespace-only HTML comments
    page_markup = """<html>
<body>
<span>The price is</span><span>$<!-- -->90<!-- -->.<!-- -->74</span>
</body>
</html>
"""

    with open("test-datastore/endpoint-content.txt", "w") as fixture_file:
        fixture_file.write(page_markup)
def test_obfuscations(client, live_server):
    """Import a watch for the obfuscated page and check the preview contains the joined price."""
    set_original_ignore_response()
    live_server_setup(live_server)
    time.sleep(1)

    # Add our URL to the import page
    endpoint_url = url_for('test_endpoint', _external=True)
    import_form = {"urls": endpoint_url}
    response = client.post(url_for("import_page"), data=import_form, follow_redirects=True)
    assert b"1 Imported" in response.data

    # Give the thread time to pick it up
    time.sleep(3)

    # Check the HTML conversion was detected and worked
    preview_url = url_for("preview_page", uuid="first")
    response = client.get(preview_url, follow_redirects=True)
    assert b'$90.74' in response.data

View File

@@ -24,7 +24,7 @@ services:
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
#
# Alternative Playwright URL, do not use "'s or 's!
# - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/
# - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/?stealth=1&--disable-web-security=true
#
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
#
@@ -78,9 +78,6 @@ services:
# - SCREEN_HEIGHT=1024
# - SCREEN_DEPTH=16
# - ENABLE_DEBUGGER=false
# - SCREEN_WIDTH=1280
# - SCREEN_HEIGHT=1024
# - SCREEN_DEPTH=16
# - PREBOOT_CHROME=true
# - CONNECTION_TIMEOUT=300000
# - MAX_CONCURRENT_SESSIONS=10