Compare commits

...

8 Commits

15 changed files with 204 additions and 126 deletions

View File

@@ -68,7 +68,7 @@ COPY changedetection.py /app/changedetection.py
# Github Action test purpose(test-only.yml).
# On production, it is effectively LOGGER_LEVEL=''.
ARG LOGGER_LEVEL=''
ENV LOGGER_LEVEL "$LOGGER_LEVEL"
ENV LOGGER_LEVEL="$LOGGER_LEVEL"
WORKDIR /app
CMD ["python", "./changedetection.py", "-d", "/datastore"]

View File

@@ -3,4 +3,6 @@
# Only exists for direct CLI usage
import changedetectionio
changedetectionio.main()
if __name__ == '__main__':
changedetectionio.main()

View File

@@ -104,6 +104,9 @@ def construct_blueprint(datastore: ChangeDetectionStore):
uuid = list(datastore.data['settings']['application']['tags'].keys()).pop()
default = datastore.data['settings']['application']['tags'].get(uuid)
if not default:
flash("Tag not found", "error")
return redirect(url_for('watchlist.index'))
form = group_restock_settings_form(
formdata=request.form if request.method == 'POST' else None,

View File

@@ -66,7 +66,7 @@
<div class="pure-control-group inline-radio">
{{ render_checkbox_field(form.notification_muted) }}
</div>
{% if is_html_webdriver %}
{% if 1 %}
<div class="pure-control-group inline-radio">
{{ render_checkbox_field(form.notification_screenshot) }}
<span class="pure-form-message-inline">

View File

@@ -213,9 +213,6 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
if request.method == 'POST' and not form.validate():
flash("An error occurred, please see below.", "error")
visualselector_data_is_ready = datastore.visualselector_data_is_ready(uuid)
# JQ is difficult to install on windows and must be manually added (outside requirements.txt)
jq_support = True
try:
@@ -225,11 +222,12 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
watch = datastore.data['watching'].get(uuid)
# if system or watch is configured to need a chrome type browser
system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver'
watch_uses_webdriver = False
watch_needs_selenium_or_playwright = False
if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'):
watch_uses_webdriver = True
watch_needs_selenium_or_playwright = True
from zoneinfo import available_timezones
@@ -247,14 +245,17 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
'has_default_notification_urls': True if len(datastore.data['settings']['application']['notification_urls']) else False,
'has_extra_headers_file': len(datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) > 0,
'has_special_tag_options': _watch_has_tag_options_set(watch=watch),
'watch_uses_webdriver': watch_uses_webdriver,
'jq_support': jq_support,
'playwright_enabled': os.getenv('PLAYWRIGHT_DRIVER_URL', False),
'settings_application': datastore.data['settings']['application'],
'system_has_playwright_configured': os.getenv('PLAYWRIGHT_DRIVER_URL'),
'system_has_webdriver_configured': os.getenv('WEBDRIVER_URL'),
'visual_selector_data_ready': datastore.visualselector_data_is_ready(watch_uuid=uuid),
'timezone_default_config': datastore.data['settings']['application'].get('timezone'),
'using_global_webdriver_wait': not default['webdriver_delay'],
'uuid': uuid,
'watch': watch
'watch': watch,
'watch_needs_selenium_or_playwright': watch_needs_selenium_or_playwright,
}
included_content = None

View File

@@ -94,11 +94,11 @@ def execute_ruleset_against_all_plugins(current_watch_uuid: str, application_dat
EXECUTE_DATA = {}
result = True
ruleset_settings = application_datastruct['watching'].get(current_watch_uuid)
watch = application_datastruct['watching'].get(current_watch_uuid)
if ruleset_settings.get("conditions"):
logic_operator = "and" if ruleset_settings.get("conditions_match_logic", "ALL") == "ALL" else "or"
complete_rules = filter_complete_rules(ruleset_settings['conditions'])
if watch and watch.get("conditions"):
logic_operator = "and" if watch.get("conditions_match_logic", "ALL") == "ALL" else "or"
complete_rules = filter_complete_rules(watch['conditions'])
if complete_rules:
# Give all plugins a chance to update the data dict again (that we will test the conditions against)
for plugin in plugin_manager.get_plugins():

View File

@@ -26,9 +26,11 @@ def capture_full_page(page):
step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Size that won't cause GPU to overflow
screenshot_chunks = []
y = 0
# If page height is larger than current viewport, use a larger viewport for better capturing
if page_height > page.viewport_size['height']:
if page_height < step_size:
step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
logger.debug(f"Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
# Set viewport to a larger size to capture more content at once
page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})
@@ -59,7 +61,10 @@ def capture_full_page(page):
p.join()
logger.debug(
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
# Explicit cleanup
del screenshot_chunks
del p
del parent_conn, child_conn
screenshot_chunks = None
return screenshot
@@ -286,12 +291,28 @@ class fetcher(Fetcher):
pass
# Clean up resources properly
context.close()
context = None
try:
self.page.request_gc()
except:
pass
self.page.close()
try:
self.page.close()
except:
pass
self.page = None
browser.close()
borwser = None
try:
context.close()
except:
pass
context = None
try:
browser.close()
except:
pass
browser = None

View File

@@ -46,9 +46,10 @@ async def capture_full_page(page):
screenshot_chunks = []
y = 0
if page_height > page.viewport['height']:
if page_height < step_size:
step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
await page.setViewport({'width': page.viewport['width'], 'height': step_size})
while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
await page.evaluate(f"window.scrollTo(0, {y})")
screenshot_chunks.append(await page.screenshot(type_='jpeg',

View File

@@ -65,7 +65,17 @@ class fetcher(Fetcher):
# request_body, request_method unused for now, until some magic in the future happens.
options = ChromeOptions()
options.add_argument("--headless")
# Load Chrome options from env
CHROME_OPTIONS = [
line.strip()
for line in os.getenv("CHROME_OPTIONS", "").strip().splitlines()
if line.strip()
]
for opt in CHROME_OPTIONS:
options.add_argument(opt)
if self.proxy:
options.proxy = self.proxy
@@ -80,7 +90,9 @@ class fetcher(Fetcher):
self.quit()
raise
self.driver.set_window_size(1280, 1024)
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
self.driver.set_window_size(1280, 1024)
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
@@ -88,6 +100,7 @@ class fetcher(Fetcher):
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver

View File

@@ -435,7 +435,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
return re.sub(pattern, repl, html_content)
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
@@ -470,9 +472,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
html_content = re.sub(r'</title>', r'</h1>', html_content)
text_content = get_text(html_content, config=parser_config)
conn.send(text_content)
conn.close()
return text_content
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
from multiprocessing import Process, Pipe
parent_conn, child_conn = Pipe()
p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
p.start()
text = parent_conn.recv()
p.join()
return text
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):

View File

@@ -98,15 +98,13 @@
{% macro playwright_warning() %}
<p><strong>Error - Playwright support for Chrome based fetching is not enabled.</strong> Alternatively try our <a href="https://changedetection.io">very affordable subscription based service which has all this setup for you</a>.</p>
<p><strong>Error - This watch needs Chrome (with playwright/sockpuppetbrowser), but Chrome based fetching is not enabled.</strong> Alternatively try our <a href="https://changedetection.io">very affordable subscription based service which has all this setup for you</a>.</p>
<p>You may need to <a href="https://github.com/dgtlmoon/changedetection.io/blob/09ebc6ec6338545bdd694dc6eee57f2e9d2b8075/docker-compose.yml#L31">Enable playwright environment variable</a> and uncomment the <strong>sockpuppetbrowser</strong> in the <a href="https://github.com/dgtlmoon/changedetection.io/blob/master/docker-compose.yml">docker-compose.yml</a> file.</p>
<br>
<p>(Also Selenium/WebDriver can not extract full page screenshots reliably so Playwright is recommended here)</p>
{% endmacro %}
{% macro only_webdriver_type_watches_warning() %}
<p><strong>Sorry, this functionality only works with Playwright/Chrome enabled watches.<br>You need to <a href="#request">Set the fetch method to Playwright/Chrome mode and resave</a> and have the Playwright connection enabled.</strong></p><br>
{% macro only_playwright_type_watches_warning() %}
<p><strong>Sorry, this functionality only works with Playwright/Chrome enabled watches.<br>You need to <a href="#request">Set the fetch method to Playwright/Chrome mode and resave</a> and have the SockpuppetBrowser/Playwright or Selenium enabled.</strong></p><br>
{% endmacro %}
{% macro render_time_schedule_form(form, available_timezones, timezone_default_config) %}

View File

@@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% block content %}
{% from '_helpers.html' import render_field, render_checkbox_field, render_button, render_time_schedule_form, playwright_warning, only_webdriver_type_watches_warning, render_conditions_fieldlist_of_formfields_as_table %}
{% from '_helpers.html' import render_field, render_checkbox_field, render_button, render_time_schedule_form, playwright_warning, only_playwright_type_watches_warning, render_conditions_fieldlist_of_formfields_as_table %}
{% from '_common_fields.html' import render_common_settings_form %}
<script src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='vis.js')}}" defer></script>
@@ -204,7 +204,9 @@ Math: {{ 1 + 1 }}") }}
</div>
<div class="tab-pane-inner" id="browser-steps">
{% if playwright_enabled and watch_uses_webdriver %}
{% if watch_needs_selenium_or_playwright %}
{# Only works with playwright #}
{% if system_has_playwright_configured %}
<img class="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}" alt="New beta functionality">
<fieldset>
<div class="pure-control-group">
@@ -223,7 +225,6 @@ Math: {{ 1 + 1 }}") }}
<div class="flex-wrapper" >
<div id="browser-steps-ui" class="noselect">
<div class="noselect" id="browsersteps-selector-wrapper" style="width: 100%">
<span class="loader" >
<span id="browsersteps-click-start">
@@ -245,15 +246,16 @@ Math: {{ 1 + 1 }}") }}
</div>
</fieldset>
{% else %}
<span class="pure-form-message-inline">
{% if not watch_uses_webdriver %}
{{ only_webdriver_type_watches_warning() }}
{% endif %}
{% if not playwright_enabled %}
{{ playwright_warning() }}
{% endif %}
</span>
{# it's configured to use selenium or chrome but system says its not configured #}
{{ playwright_warning() }}
{% if system_has_webdriver_configured %}
<strong>Selenium/Webdriver cant be used here because it wont fetch screenshots reliably.</strong>
{% endif %}
{% endif %}
{% else %}
{# "This functionality needs chrome.." #}
{{ only_playwright_type_watches_warning() }}
{% endif %}
</div>
@@ -262,7 +264,7 @@ Math: {{ 1 + 1 }}") }}
<div class="pure-control-group inline-radio">
{{ render_checkbox_field(form.notification_muted) }}
</div>
{% if watch_uses_webdriver %}
{% if watch_needs_selenium_or_playwright %}
<div class="pure-control-group inline-radio">
{{ render_checkbox_field(form.notification_screenshot) }}
<span class="pure-form-message-inline">
@@ -379,7 +381,9 @@ Math: {{ 1 + 1 }}") }}
<fieldset>
<div class="pure-control-group">
{% if playwright_enabled and watch_uses_webdriver %}
{% if watch_needs_selenium_or_playwright %}
{% if system_has_playwright_configured %}
{% if visual_selector_data_ready %}
<span class="pure-form-message-inline" id="visual-selector-heading">
The Visual Selector tool lets you select the <i>text</i> elements that will be used for the change detection. It automatically fills-in the filters in the "CSS/JSONPath/JQ/XPath Filters" box of the <a href="#filters-and-triggers">Filters & Triggers</a> tab. Use <strong>Shift+Click</strong> to select multiple items.
</span>
@@ -396,14 +400,20 @@ Math: {{ 1 + 1 }}") }}
<canvas id="selector-canvas"></canvas>
</div>
<div id="selector-current-xpath" style="overflow-x: hidden"><strong>Currently:</strong>&nbsp;<span class="text">Loading...</span></div>
{% else %}
<strong>Error, The Visual selector data is not ready, it needs to complete atleast one fetch, please queue the item and reload.</strong>
{% endif %}
{% else %}
{% if not watch_uses_webdriver %}
{{ only_webdriver_type_watches_warning() }}
{% endif %}
{% if not playwright_enabled %}
{{ playwright_warning() }}
{% endif %}
{# The watch needed chrome but system says that playwright is not ready #}
{{ playwright_warning() }}
{% endif %}
{% if system_has_webdriver_configured %}
<strong>Selenium/Webdriver cant be used here because it wont fetch screenshots reliably.</strong>
{% endif %}
{% else %}
{# "This functionality needs chrome.." #}
{{ only_playwright_type_watches_warning() }}
{% endif %}
</div>
</fieldset>
</div>

View File

@@ -9,15 +9,20 @@ services:
# - ./proxies.json:/datastore/proxies.json
# environment:
# Default listening port, can also be changed with the -p option
# Default listening port, can also be changed with the -p option (not to be confused with ports: below)
# - PORT=5000
#
# Log levels are in descending order. (TRACE is the most detailed one)
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
# - LOGGER_LEVEL=TRACE
#
# Alternative WebDriver/selenium URL, do not use "'s or 's!
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
#
# Uncomment below and the "sockpuppetbrowser" to use a real Chrome browser (It uses the "playwright" protocol)
# - PLAYWRIGHT_DRIVER_URL=ws://browser-sockpuppet-chrome:3000
#
#
# Alternative WebDriver/selenium URL, do not use "'s or 's! (old, deprecated, does not support screenshots very well)
# - WEBDRIVER_URL=http://browser-selenium-chrome:4444/wd/hub
#
# WebDriver proxy settings webdriver_proxyType, webdriver_ftpProxy, webdriver_noProxy,
# webdriver_proxyAutoconfigUrl, webdriver_autodetect,
@@ -25,9 +30,6 @@ services:
#
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
#
# Alternative target "Chrome" Playwright URL, do not use "'s or 's!
# "Playwright" is a driver/librarythat allows changedetection to talk to a Chrome or similar browser.
# - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
#
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
#
@@ -43,7 +45,7 @@ services:
# Base URL of your changedetection.io install (Added to the notification alert)
# - BASE_URL=https://mysite.com
# Respect proxy_pass type settings, `proxy_set_header Host "localhost";` and `proxy_set_header X-Forwarded-Prefix /app;`
# More here https://github.com/dgtlmoon/changedetection.io/wiki/Running-changedetection.io-behind-a-reverse-proxy-sub-directory
# More here https://github.com/dgtlmoon/changedetection.io/wiki/Running-changedetection.io-behind-a-reverse-proxy
# - USE_X_SETTINGS=1
#
# Hides the `Referer` header so that monitored websites can't see the changedetection.io hostname.
@@ -86,8 +88,8 @@ services:
# Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages.
# RECOMMENDED FOR FETCHING PAGES WITH CHROME, be sure to enable the "PLAYWRIGHT_DRIVER_URL" env variable in the main changedetection container
# sockpuppetbrowser:
# hostname: sockpuppetbrowser
# browser-sockpuppet-chrome:
# hostname: browser-sockpuppet-chrome
# image: dgtlmoon/sockpuppetbrowser:latest
# cap_add:
# - SYS_ADMIN
@@ -102,14 +104,18 @@ services:
# Used for fetching pages via Playwright+Chrome where you need Javascript support.
# Note: Works well but is deprecated, does not fetch full page screenshots (doesnt work with Visual Selector)
# Does not report status codes (200, 404, 403) and other issues
# browser-chrome:
# hostname: browser-chrome
# browser-selenium-chrome:
# hostname: browser-selenium-chrome
# image: selenium/standalone-chrome:4
# environment:
# - VNC_NO_PASSWORD=1
# - SCREEN_WIDTH=1920
# - SCREEN_HEIGHT=1080
# - SCREEN_DEPTH=24
# CHROME_OPTIONS: |
# --window-size=1280,1024
# --headless
# --disable-gpu
# volumes:
# # Workaround to avoid the browser crashing inside a docker container
# # See https://github.com/SeleniumHQ/docker-selenium#quick-start

File diff suppressed because one or more lines are too long

View File

@@ -5,13 +5,13 @@
<meta name="description" content="Manage your changedetection.io watches via API, requires the `x-api-key` header which is found in the settings UI.">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<link href="assets/bootstrap.min.css?v=1701595483622" rel="stylesheet" media="screen">
<link href="assets/prism.css?v=1701595483622" rel="stylesheet" />
<link href="assets/main.css?v=1701595483622" rel="stylesheet" media="screen, print">
<link href="assets/favicon.ico?v=1701595483622" rel="icon" type="image/x-icon">
<link href="assets/apple-touch-icon.png?v=1701595483622" rel="apple-touch-icon" sizes="180x180">
<link href="assets/favicon-32x32.png?v=1701595483622" rel="icon" type="image/png" sizes="32x32">
<link href="assets/favicon-16x16.png?v=1701595483622" rel="icon" type="image/png" sizes="16x16">
<link href="assets/bootstrap.min.css?v=1744573753999" rel="stylesheet" media="screen">
<link href="assets/prism.css?v=1744573753999" rel="stylesheet" />
<link href="assets/main.css?v=1744573753999" rel="stylesheet" media="screen, print">
<link href="assets/favicon.ico?v=1744573753999" rel="icon" type="image/x-icon">
<link href="assets/apple-touch-icon.png?v=1744573753999" rel="apple-touch-icon" sizes="180x180">
<link href="assets/favicon-32x32.png?v=1744573753999" rel="icon" type="image/png" sizes="32x32">
<link href="assets/favicon-16x16.png?v=1744573753999" rel="icon" type="image/png" sizes="16x16">
</head>
<body class="container-fluid">
@@ -928,6 +928,6 @@
</div>
</div>
<script src="assets/main.bundle.js?v=1701595483622"></script>
<script src="assets/main.bundle.js?v=1744573753999"></script>
</body>
</html>