Compare commits

...

11 Commits

26 changed files with 663 additions and 323 deletions

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
__version__ = '0.49.15'
__version__ = '0.49.17'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -168,9 +168,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
step_optional_value = request.form.get('optional_value')
is_last_step = strtobool(request.form.get('is_last_step'))
# @todo try.. accept.. nice errors not popups..
try:
browsersteps_sessions[browsersteps_session_id]['browserstepper'].call_action(action_name=step_operation,
selector=step_selector,
optional_value=step_optional_value)

View File

@@ -61,23 +61,6 @@ class steppable_browser_interface():
def __init__(self, start_url):
self.start_url = start_url
def safe_page_operation(self, operation_fn, default_return=None):
"""Safely execute a page operation with error handling"""
if self.page is None:
logger.warning("Attempted operation on None page object")
return default_return
try:
return operation_fn()
except Exception as e:
logger.debug(f"Page operation failed: {str(e)}")
# Try to reclaim memory if possible
try:
self.page.request_gc()
except:
pass
return default_return
# Convert and perform "Click Button" for example
def call_action(self, action_name, selector=None, optional_value=None):
@@ -109,20 +92,11 @@ class steppable_browser_interface():
if optional_value and ('{%' in optional_value or '{{' in optional_value):
optional_value = jinja_render(template_str=optional_value)
try:
action_handler(selector, optional_value)
# Safely wait for timeout
def wait_timeout():
self.page.wait_for_timeout(1.5 * 1000)
self.safe_page_operation(wait_timeout)
logger.debug(f"Call action done in {time.time()-now:.2f}s")
except Exception as e:
logger.error(f"Error executing action '{call_action_name}': {str(e)}")
# Request garbage collection to free up resources after error
try:
self.page.request_gc()
except:
pass
action_handler(selector, optional_value)
# Safely wait for timeout
self.page.wait_for_timeout(1.5 * 1000)
logger.debug(f"Call action done in {time.time()-now:.2f}s")
def action_goto_url(self, selector=None, value=None):
if not value:
@@ -130,11 +104,7 @@ class steppable_browser_interface():
return None
now = time.time()
def goto_operation():
return self.page.goto(value, timeout=0, wait_until='load')
response = self.safe_page_operation(goto_operation)
response = self.page.goto(value, timeout=0, wait_until='load')
logger.debug(f"Time to goto URL {time.time()-now:.2f}s")
return response
@@ -147,53 +117,40 @@ class steppable_browser_interface():
if not value or not len(value.strip()):
return
def click_operation():
elem = self.page.get_by_text(value)
if elem.count():
elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)
self.safe_page_operation(click_operation)
elem = self.page.get_by_text(value)
if elem.count():
elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)
def action_click_element_containing_text_if_exists(self, selector=None, value=''):
logger.debug("Clicking element containing text if exists")
if not value or not len(value.strip()):
return
def click_if_exists_operation():
elem = self.page.get_by_text(value)
logger.debug(f"Clicking element containing text - {elem.count()} elements found")
if elem.count():
elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)
elem = self.page.get_by_text(value)
logger.debug(f"Clicking element containing text - {elem.count()} elements found")
if elem.count():
elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)
self.safe_page_operation(click_if_exists_operation)
def action_enter_text_in_field(self, selector, value):
if not selector or not len(selector.strip()):
return
def fill_operation():
self.page.fill(selector, value, timeout=self.action_timeout)
self.safe_page_operation(fill_operation)
self.page.fill(selector, value, timeout=self.action_timeout)
def action_execute_js(self, selector, value):
if not value:
return None
def evaluate_operation():
return self.page.evaluate(value)
return self.safe_page_operation(evaluate_operation)
return self.page.evaluate(value)
def action_click_element(self, selector, value):
logger.debug("Clicking element")
if not selector or not len(selector.strip()):
return
def click_operation():
self.page.click(selector=selector, timeout=self.action_timeout + 20 * 1000, delay=randint(200, 500))
self.safe_page_operation(click_operation)
self.page.click(selector=selector, timeout=self.action_timeout + 20 * 1000, delay=randint(200, 500))
def action_click_element_if_exists(self, selector, value):
import playwright._impl._errors as _api_types
@@ -201,16 +158,14 @@ class steppable_browser_interface():
if not selector or not len(selector.strip()):
return
def click_if_exists_operation():
try:
self.page.click(selector, timeout=self.action_timeout, delay=randint(200, 500))
except _api_types.TimeoutError:
return
except _api_types.Error:
# Element was there, but page redrew and now its long long gone
return
try:
self.page.click(selector, timeout=self.action_timeout, delay=randint(200, 500))
except _api_types.TimeoutError:
return
except _api_types.Error:
# Element was there, but page redrew and now its long long gone
return
self.safe_page_operation(click_if_exists_operation)
def action_click_x_y(self, selector, value):
if not value or not re.match(r'^\s?\d+\s?,\s?\d+\s?$', value):
@@ -222,10 +177,8 @@ class steppable_browser_interface():
x = int(float(x.strip()))
y = int(float(y.strip()))
def click_xy_operation():
self.page.mouse.click(x=x, y=y, delay=randint(200, 500))
self.page.mouse.click(x=x, y=y, delay=randint(200, 500))
self.safe_page_operation(click_xy_operation)
except Exception as e:
logger.error(f"Error parsing x,y coordinates: {str(e)}")
@@ -233,27 +186,17 @@ class steppable_browser_interface():
if not selector or not len(selector.strip()):
return
def select_operation():
self.page.select_option(selector, label=value, timeout=self.action_timeout)
self.safe_page_operation(select_operation)
self.page.select_option(selector, label=value, timeout=self.action_timeout)
def action_scroll_down(self, selector, value):
def scroll_operation():
# Some sites this doesnt work on for some reason
self.page.mouse.wheel(0, 600)
self.page.wait_for_timeout(1000)
self.safe_page_operation(scroll_operation)
# Some sites this doesnt work on for some reason
self.page.mouse.wheel(0, 600)
self.page.wait_for_timeout(1000)
def action_wait_for_seconds(self, selector, value):
try:
seconds = float(value.strip()) if value else 1.0
def wait_operation():
self.page.wait_for_timeout(seconds * 1000)
self.safe_page_operation(wait_operation)
self.page.wait_for_timeout(seconds * 1000)
except (ValueError, TypeError) as e:
logger.error(f"Invalid value for wait_for_seconds: {str(e)}")
@@ -263,14 +206,11 @@ class steppable_browser_interface():
import json
v = json.dumps(value)
def wait_for_text_operation():
self.page.wait_for_function(
f'document.querySelector("body").innerText.includes({v});',
timeout=30000
)
self.page.wait_for_function(
f'document.querySelector("body").innerText.includes({v});',
timeout=30000
)
self.safe_page_operation(wait_for_text_operation)
def action_wait_for_text_in_element(self, selector, value):
if not selector or not value:
@@ -280,82 +220,60 @@ class steppable_browser_interface():
s = json.dumps(selector)
v = json.dumps(value)
def wait_for_text_in_element_operation():
self.page.wait_for_function(
f'document.querySelector({s}).innerText.includes({v});',
timeout=30000
)
self.safe_page_operation(wait_for_text_in_element_operation)
self.page.wait_for_function(
f'document.querySelector({s}).innerText.includes({v});',
timeout=30000
)
# @todo - in the future make some popout interface to capture what needs to be set
# https://playwright.dev/python/docs/api/class-keyboard
def action_press_enter(self, selector, value):
def press_operation():
self.page.keyboard.press("Enter", delay=randint(200, 500))
self.page.keyboard.press("Enter", delay=randint(200, 500))
self.safe_page_operation(press_operation)
def action_press_page_up(self, selector, value):
def press_operation():
self.page.keyboard.press("PageUp", delay=randint(200, 500))
self.safe_page_operation(press_operation)
self.page.keyboard.press("PageUp", delay=randint(200, 500))
def action_press_page_down(self, selector, value):
def press_operation():
self.page.keyboard.press("PageDown", delay=randint(200, 500))
self.safe_page_operation(press_operation)
self.page.keyboard.press("PageDown", delay=randint(200, 500))
def action_check_checkbox(self, selector, value):
if not selector:
return
def check_operation():
self.page.locator(selector).check(timeout=self.action_timeout)
self.safe_page_operation(check_operation)
self.page.locator(selector).check(timeout=self.action_timeout)
def action_uncheck_checkbox(self, selector, value):
if not selector:
return
def uncheck_operation():
self.page.locator(selector).uncheck(timeout=self.action_timeout)
self.page.locator(selector).uncheck(timeout=self.action_timeout)
self.safe_page_operation(uncheck_operation)
def action_remove_elements(self, selector, value):
"""Removes all elements matching the given selector from the DOM."""
if not selector:
return
def remove_operation():
self.page.locator(selector).evaluate_all("els => els.forEach(el => el.remove())")
self.safe_page_operation(remove_operation)
self.page.locator(selector).evaluate_all("els => els.forEach(el => el.remove())")
def action_make_all_child_elements_visible(self, selector, value):
"""Recursively makes all child elements inside the given selector fully visible."""
if not selector:
return
def make_visible_operation():
self.page.locator(selector).locator("*").evaluate_all("""
els => els.forEach(el => {
el.style.display = 'block'; // Forces it to be displayed
el.style.visibility = 'visible'; // Ensures it's not hidden
el.style.opacity = '1'; // Fully opaque
el.style.position = 'relative'; // Avoids 'absolute' hiding
el.style.height = 'auto'; // Expands collapsed elements
el.style.width = 'auto'; // Ensures full visibility
el.removeAttribute('hidden'); // Removes hidden attribute
el.classList.remove('hidden', 'd-none'); // Removes common CSS hidden classes
})
""")
self.safe_page_operation(make_visible_operation)
self.page.locator(selector).locator("*").evaluate_all("""
els => els.forEach(el => {
el.style.display = 'block'; // Forces it to be displayed
el.style.visibility = 'visible'; // Ensures it's not hidden
el.style.opacity = '1'; // Fully opaque
el.style.position = 'relative'; // Avoids 'absolute' hiding
el.style.height = 'auto'; // Expands collapsed elements
el.style.width = 'auto'; // Ensures full visibility
el.removeAttribute('hidden'); // Removes hidden attribute
el.classList.remove('hidden', 'd-none'); // Removes common CSS hidden classes
})
""")
# Responsible for maintaining a live 'context' with the chrome CDP
# @todo - how long do contexts live for anyway?

View File

@@ -122,7 +122,11 @@
{% set mute_label = 'UnMute notification' if watch.notification_muted else 'Mute notification' %}
<a class="link-mute state-{{'on' if watch.notification_muted else 'off'}}" href="{{url_for('watchlist.index', op='mute', uuid=watch.uuid, tag=active_tag_uuid)}}"><img src="{{url_for('static_content', group='images', filename='bell-off.svg')}}" alt="{{ mute_label }}" title="{{ mute_label }}" class="icon icon-mute" ></a>
</td>
<td class="title-col inline">{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}}
<td class="title-col inline">
{% if watch.get_screenshot() %}
<img class="thumbnail" src="{{url_for('static_content', group='thumbnail', filename=watch.uuid)}}" alt="thumbnail screenshot" title="thumbnail screenshot" >
{% endif %}
<span>{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}}</span>
<a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}"></a>
<a class="link-spread" href="{{url_for('ui.form_share_put_watch', uuid=watch.uuid)}}"><img src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="status-icon icon icon-spread" title="Create a link to share watch config with others" ></a>
@@ -137,13 +141,11 @@
{% if watch.has_browser_steps %}<img class="status-icon status-browsersteps" src="{{url_for('static_content', group='images', filename='steps.svg')}}" title="Browser Steps is enabled" >{% endif %}
{% if watch.last_error is defined and watch.last_error != False %}
<div class="fetch-error">{{ watch.last_error }}
{% if '403' in watch.last_error %}
{% if has_proxies %}
<a href="{{ url_for('settings.settings_page', uuid=watch.uuid) }}#proxies">Try other proxies/location</a>&nbsp;
{% endif %}
<a href="{{ url_for('settings.settings_page', uuid=watch.uuid) }}#proxies">Try adding external proxies/locations</a>
{% endif %}
{% if 'empty result or contain only an image' in watch.last_error %}
<a href="https://github.com/dgtlmoon/changedetection.io/wiki/Detecting-changes-in-images">more help here</a>.
@@ -166,7 +168,7 @@
<span class="watch-tag-list">{{ watch_tag.title }}</span>
{% endfor %}
</td>
<!-- @todo make it so any watch handler obj can expose this --->
{% if any_has_restock_price_processor %}
<td class="restock-and-price">
{% if watch['processor'] == 'restock_diff' %}

View File

@@ -1,11 +1,9 @@
from flask import Blueprint
from json_logic.builtins import BUILTINS
from .exceptions import EmptyConditionRuleRowNotUsable
from .pluggy_interface import plugin_manager # Import the pluggy plugin manager
from . import default_plugin
from loguru import logger
# List of all supported JSON Logic operators
operator_choices = [
(None, "Choose one - Operator"),
@@ -113,12 +111,14 @@ def execute_ruleset_against_all_plugins(current_watch_uuid: str, application_dat
application_datastruct=application_datastruct,
ephemeral_data=ephemeral_data
)
logger.debug(f"Trying plugin {plugin}....")
# Set a timeout of 10 seconds
try:
new_execute_data = future.result(timeout=10)
if new_execute_data and isinstance(new_execute_data, dict):
EXECUTE_DATA.update(new_execute_data)
except concurrent.futures.TimeoutError:
# The plugin took too long, abort processing for this watch
raise Exception(f"Plugin {plugin.__class__.__name__} took more than 10 seconds to run.")

View File

@@ -9,15 +9,20 @@ def levenshtein_ratio_recent_history(watch, incoming_text=None):
try:
from Levenshtein import ratio, distance
k = list(watch.history.keys())
if len(k) >= 2:
# When called from ui_edit_stats_extras, we don't have incoming_text
if incoming_text is None:
a = watch.get_history_snapshot(timestamp=k[-1]) # Latest snapshot
b = watch.get_history_snapshot(timestamp=k[-2]) # Previous snapshot
else:
a = watch.get_history_snapshot(timestamp=k[-2]) # Second newest, incoming_text will be "newest"
b = incoming_text
a = None
b = None
# When called from ui_edit_stats_extras, we don't have incoming_text
if incoming_text is None:
a = watch.get_history_snapshot(timestamp=k[-1]) # Latest snapshot
b = watch.get_history_snapshot(timestamp=k[-2]) # Previous snapshot
# Needs atleast one snapshot
elif len(k) >= 1: # Should be atleast one snapshot to compare against
a = watch.get_history_snapshot(timestamp=k[-1]) # Latest saved snapshot
b = incoming_text if incoming_text else k[-2]
if a and b:
distance_value = distance(a, b)
ratio_value = ratio(a, b)
return {
@@ -53,7 +58,7 @@ def add_data(current_watch_uuid, application_datastruct, ephemeral_data):
# ephemeral_data['text'] will be the current text after filters, they may have edited filters but not saved them yet etc
if watch and 'text' in ephemeral_data:
lev_data = levenshtein_ratio_recent_history(watch, ephemeral_data['text'])
lev_data = levenshtein_ratio_recent_history(watch, ephemeral_data.get('text',''))
if isinstance(lev_data, dict):
res['levenshtein_ratio'] = lev_data.get('ratio', 0)
res['levenshtein_similarity'] = lev_data.get('percent_similar', 0)

View File

@@ -194,7 +194,6 @@ class fetcher(Fetcher):
browsersteps_interface.page = self.page
response = browsersteps_interface.action_goto_url(value=url)
self.headers = response.all_headers()
if response is None:
context.close()
@@ -202,6 +201,8 @@ class fetcher(Fetcher):
logger.debug("Content Fetcher > Response object from the browser communication was none")
raise EmptyReply(url=url, status_code=None)
self.headers = response.all_headers()
try:
if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)

View File

@@ -28,6 +28,7 @@ class fetcher(Fetcher):
import chardet
import requests
from requests.exceptions import ProxyError, ConnectionError, RequestException
if self.browser_steps_get_valid_steps():
raise BrowserStepsInUnsupportedFetcher(url=url)
@@ -52,14 +53,19 @@ class fetcher(Fetcher):
if strtobool(os.getenv('ALLOW_FILE_URI', 'false')) and url.startswith('file://'):
from requests_file import FileAdapter
session.mount('file://', FileAdapter())
r = session.request(method=request_method,
data=request_body.encode('utf-8') if type(request_body) is str else request_body,
url=url,
headers=request_headers,
timeout=timeout,
proxies=proxies,
verify=False)
try:
r = session.request(method=request_method,
data=request_body.encode('utf-8') if type(request_body) is str else request_body,
url=url,
headers=request_headers,
timeout=timeout,
proxies=proxies,
verify=False)
except Exception as e:
msg = str(e)
if proxies and 'SOCKSHTTPSConnectionPool' in msg:
msg = f"Proxy connection failed? {msg}"
raise Exception(msg) from e
# If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
# For example - some sites don't tell us it's utf-8, but return utf-8 content

View File

@@ -51,6 +51,7 @@ async () => {
'niet op voorraad',
'no disponible',
'no featured offers available',
'no longer available',
'no longer in stock',
'no tickets available',
'non disponibile',
@@ -125,6 +126,20 @@ async () => {
// so it's good to filter to just the 'above the fold' elements
// and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist
function elementIsInEyeBallRange(element) {
// outside the 'fold' or some weird text in the heading area
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
// Note: theres also an automated test that places the 'out of stock' text fairly low down
// Skip text that could be in the header area
if (element.getBoundingClientRect().bottom + window.scrollY <= 300 ) {
return false;
}
// Skip text that could be much further down (like a list of "you may like" products that have 'sold out' in there
if (element.getBoundingClientRect().bottom + window.scrollY >= 1300 ) {
return false;
}
return true;
}
// @todo - if it's SVG or IMG, go into image diff mode
@@ -161,9 +176,7 @@ async () => {
for (let i = elementsToScan.length - 1; i >= 0; i--) {
const element = elementsToScan[i];
// outside the 'fold' or some weird text in the heading area
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) {
if (!elementIsInEyeBallRange(element)) {
continue
}
@@ -177,11 +190,11 @@ async () => {
} catch (e) {
console.warn('stock-not-in-stock.js scraper - handling element for gettext failed', e);
}
if (elementText.length) {
// try which ones could mean its in stock
if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) {
console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`)
element.style.border = "2px solid green"; // highlight the element that was detected as in stock
return 'Possibly in stock';
}
}
@@ -190,10 +203,8 @@ async () => {
// OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK
for (let i = elementsToScan.length - 1; i >= 0; i--) {
const element = elementsToScan[i];
// outside the 'fold' or some weird text in the heading area
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
// Note: theres also an automated test that places the 'out of stock' text fairly low down
if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
if (!elementIsInEyeBallRange(element)) {
continue
}
elementText = "";
@@ -208,6 +219,7 @@ async () => {
for (const outOfStockText of outOfStockTexts) {
if (elementText.includes(outOfStockText)) {
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
element.style.border = "2px solid red"; // highlight the element that was detected as out of stock
return outOfStockText; // item is out of stock
}
}

View File

@@ -10,16 +10,13 @@ class fetcher(Fetcher):
else:
fetcher_description = "WebDriver Chrome/Javascript"
# Configs for Proxy setup
# In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
proxy = None
proxy_url = None
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
from urllib.parse import urlparse
from selenium.webdriver.common.proxy import Proxy
# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
if not custom_browser_connection_url:
@@ -28,25 +25,27 @@ class fetcher(Fetcher):
self.browser_connection_is_custom = True
self.browser_connection_url = custom_browser_connection_url
# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
for k in self.selenium_proxy_settings_mappings:
v = os.getenv('webdriver_' + k, False)
if v:
proxy_args[k] = v.strip('"')
# Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy
if not proxy_args.get('webdriver_httpProxy') and self.system_http_proxy:
proxy_args['httpProxy'] = self.system_http_proxy
if not proxy_args.get('webdriver_sslProxy') and self.system_https_proxy:
proxy_args['httpsProxy'] = self.system_https_proxy
##### PROXY SETUP #####
# Allows override the proxy on a per-request basis
if proxy_override is not None:
proxy_args['httpProxy'] = proxy_override
proxy_sources = [
self.system_http_proxy,
self.system_https_proxy,
os.getenv('webdriver_proxySocks'),
os.getenv('webdriver_socksProxy'),
os.getenv('webdriver_proxyHttp'),
os.getenv('webdriver_httpProxy'),
os.getenv('webdriver_proxyHttps'),
os.getenv('webdriver_httpsProxy'),
os.getenv('webdriver_sslProxy'),
proxy_override, # last one should override
]
# The built in selenium proxy handling is super unreliable!!! so we just grab which ever proxy setting we can find and throw it in --proxy-server=
for k in filter(None, proxy_sources):
if not k:
continue
self.proxy_url = k.strip()
if proxy_args:
self.proxy = SeleniumProxy(raw=proxy_args)
def run(self,
url,
@@ -59,9 +58,7 @@ class fetcher(Fetcher):
is_binary=False,
empty_pages_are_a_change=False):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.common.exceptions import WebDriverException
# request_body, request_method unused for now, until some magic in the future happens.
options = ChromeOptions()
@@ -76,59 +73,62 @@ class fetcher(Fetcher):
for opt in CHROME_OPTIONS:
options.add_argument(opt)
if self.proxy:
options.proxy = self.proxy
# 1. proxy_config /Proxy(proxy_config) selenium object is REALLY unreliable
# 2. selenium-wire cant be used because the websocket version conflicts with pypeteer-ng
# 3. selenium only allows ONE runner at a time by default!
# 4. driver must use quit() or it will continue to block/hold the selenium process!!
self.driver = webdriver.Remote(
command_executor=self.browser_connection_url,
options=options)
if self.proxy_url:
options.add_argument(f'--proxy-server={self.proxy_url}')
from selenium.webdriver.remote.remote_connection import RemoteConnection
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
driver = None
try:
# Create the RemoteConnection and set timeout (e.g., 30 seconds)
remote_connection = RemoteConnection(
self.browser_connection_url,
)
remote_connection.set_timeout(30) # seconds
# Now create the driver with the RemoteConnection
driver = RemoteWebDriver(
command_executor=remote_connection,
options=options
)
driver.set_page_load_timeout(int(os.getenv("WEBDRIVER_PAGELOAD_TIMEOUT", 45)))
except Exception as e:
if driver:
driver.quit()
raise e
try:
self.driver.get(url)
except WebDriverException as e:
# Be sure we close the session window
self.quit()
raise
driver.get(url)
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
self.driver.set_window_size(1280, 1024)
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
driver.set_window_size(1280, 1024)
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
self.driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = driver.page_source
self.headers = {}
self.screenshot = driver.get_screenshot_as_png()
except Exception as e:
driver.quit()
raise e
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = self.driver.page_source
self.headers = {}
driver.quit()
self.screenshot = self.driver.get_screenshot_as_png()
# Does the connection to the webdriver work? run a test connection.
def is_ready(self):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
self.driver = webdriver.Remote(
command_executor=self.command_executor,
options=ChromeOptions())
# driver.quit() seems to cause better exceptions
self.quit()
return True
def quit(self, watch=None):
if self.driver:
try:
self.driver.quit()
except Exception as e:
logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")

View File

@@ -378,6 +378,42 @@ def changedetection_app(config=None, datastore_o=None):
except FileNotFoundError:
abort(404)
if group == 'thumbnail':
# Could be sensitive, follow password requirements
if datastore.data['settings']['application']['password'] and not flask_login.current_user.is_authenticated:
abort(403)
# Get the watch object
watch = datastore.data['watching'].get(filename)
if not watch:
abort(404)
# Generate thumbnail if needed
max_age = int(request.args.get('max_age', '3200'))
thumbnail_path = watch.get_screenshot_as_thumbnail(max_age=max_age)
if not thumbnail_path:
abort(404)
try:
# Get file modification time for ETag
file_mtime = int(os.path.getmtime(thumbnail_path))
etag = f'"{file_mtime}"'
# Check if browser has valid cached version
if request.if_none_match and etag in request.if_none_match:
return "", 304 # Not Modified
# Set up response with appropriate cache headers
response = make_response(send_from_directory(os.path.dirname(thumbnail_path), os.path.basename(thumbnail_path)))
response.headers['Content-type'] = 'image/jpeg'
response.headers['ETag'] = etag
response.headers['Cache-Control'] = 'max-age=300, must-revalidate' # Cache for 5 minutes, then revalidate
return response
except FileNotFoundError:
abort(404)
if group == 'visual_selector_data':
# Could be sensitive, follow password requirements

View File

@@ -224,27 +224,37 @@ class StringDictKeyValue(StringField):
def _value(self):
if self.data:
output = u''
for k in self.data.keys():
output += "{}: {}\r\n".format(k, self.data[k])
output = ''
for k, v in self.data.items():
output += f"{k}: {v}\r\n"
return output
else:
return u''
return ''
# incoming
# incoming data processing + validation
def process_formdata(self, valuelist):
self.data = {}
errors = []
if valuelist:
self.data = {}
# Remove empty strings
cleaned = list(filter(None, valuelist[0].split("\n")))
for s in cleaned:
parts = s.strip().split(':', 1)
if len(parts) == 2:
self.data.update({parts[0].strip(): parts[1].strip()})
# Remove empty strings (blank lines)
cleaned = [line.strip() for line in valuelist[0].split("\n") if line.strip()]
for idx, s in enumerate(cleaned, start=1):
if ':' not in s:
errors.append(f"Line {idx} is missing a ':' separator.")
continue
parts = s.split(':', 1)
key = parts[0].strip()
value = parts[1].strip()
else:
self.data = {}
if not key:
errors.append(f"Line {idx} has an empty key.")
if not value:
errors.append(f"Line {idx} has an empty value.")
self.data[key] = value
if errors:
raise ValidationError("Invalid input:\n" + "\n".join(errors))
class ValidateContentFetcherIsReady(object):
"""

View File

@@ -309,10 +309,10 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
soup = BeautifulSoup(content, 'html.parser')
if ensure_is_ldjson_info_type:
bs_result = soup.findAll('script', {"type": "application/ld+json"})
bs_result = soup.find_all('script', {"type": "application/ld+json"})
else:
bs_result = soup.findAll('script')
bs_result += soup.findAll('body')
bs_result = soup.find_all('script')
bs_result += soup.find_all('body')
bs_jsons = []
for result in bs_result:
@@ -436,55 +436,27 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
return re.sub(pattern, repl, html_content)
def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
"""Converts html string to a string with just the text. If ignoring
rendering anchor tag content is enable, anchor tag content are also
included in the text
:param html_content: string with html content
:param render_anchor_tag_content: boolean flag indicating whether to extract
hyperlinks (the anchor tag content) together with text. This refers to the
'href' inside 'a' tags.
Anchor tag content is rendered in the following manner:
'[ text ](anchor tag content)'
:return: extracted text from the HTML
"""
# if anchor tag content flag is set to True define a config for
# extracting this content
if render_anchor_tag_content:
parser_config = ParserConfig(
annotation_rules={"a": ["hyperlink"]},
display_links=True
)
# otherwise set config to None/default
else:
parser_config = None
# RSS Mode - Inscriptis will treat `title` as something else.
# Make it as a regular block display element (//item/title)
# This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
if is_rss:
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
html_content = re.sub(r'</title>', r'</h1>', html_content)
text_content = get_text(html_content, config=parser_config)
conn.send(text_content)
conn.close()
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
from multiprocessing import Process, Pipe
parent_conn, child_conn = Pipe()
p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
p.start()
text = parent_conn.recv()
p.join()
return text
return text_content
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):

View File

@@ -401,6 +401,70 @@ class model(watch_base):
# False is not an option for AppRise, must be type None
return None
def get_screenshot_as_thumbnail(self, max_age=3200):
"""Return path to a square thumbnail of the most recent screenshot.
Creates a 150x150 pixel thumbnail from the top portion of the screenshot.
Args:
max_age: Maximum age in seconds before recreating thumbnail
Returns:
Path to thumbnail or None if no screenshot exists
"""
import os
import time
thumbnail_path = os.path.join(self.watch_data_dir, "thumbnail.jpeg")
top_trim = 500 # Pixels from top of screenshot to use
screenshot_path = self.get_screenshot()
if not screenshot_path:
return None
# Reuse thumbnail if it's fresh and screenshot hasn't changed
if os.path.isfile(thumbnail_path):
thumbnail_mtime = os.path.getmtime(thumbnail_path)
screenshot_mtime = os.path.getmtime(screenshot_path)
if screenshot_mtime <= thumbnail_mtime and time.time() - thumbnail_mtime < max_age:
return thumbnail_path
try:
from PIL import Image
with Image.open(screenshot_path) as img:
# Crop top portion first (full width, top_trim height)
top_crop_height = min(top_trim, img.height)
img = img.crop((0, 0, img.width, top_crop_height))
# Create a smaller intermediate image (to reduce memory usage)
aspect = img.width / img.height
interim_width = min(top_trim, img.width)
interim_height = int(interim_width / aspect) if aspect > 0 else top_trim
img = img.resize((interim_width, interim_height), Image.NEAREST)
# Convert to RGB if needed
if img.mode != 'RGB':
img = img.convert('RGB')
# Crop to square from top center
square_size = min(img.width, img.height)
left = (img.width - square_size) // 2
img = img.crop((left, 0, left + square_size, square_size))
# Final resize to exact thumbnail size with better filter
img = img.resize((150, 150), Image.BILINEAR)
# Save with optimized settings
img.save(thumbnail_path, "JPEG", quality=75, optimize=True)
return thumbnail_path
except Exception as e:
logger.error(f"Error creating thumbnail for {self.get('uuid')}: {str(e)}")
return None
def __get_file_ctime(self, filename):
fname = os.path.join(self.watch_data_dir, filename)
if os.path.isfile(fname):

View File

@@ -38,6 +38,9 @@ pytest tests/test_backend.py
pytest tests/test_rss.py
pytest tests/test_unique_lines.py
# Try high concurrency
FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
# Check file:// will pickup a file when enabled
echo "Hello world" > /tmp/test-file.txt
ALLOW_FILE_URI=yes pytest tests/test_security.py

View File

@@ -82,3 +82,25 @@ done
docker kill squid-one squid-two squid-custom
# Test that the UI is returning the correct error message when a proxy is not available
# Requests
docker run --network changedet-network \
test-changedetectionio \
bash -c 'cd changedetectionio && pytest tests/proxy_list/test_proxy_noconnect.py'
# Playwright
docker run --network changedet-network \
test-changedetectionio \
bash -c 'cd changedetectionio && PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 pytest tests/proxy_list/test_proxy_noconnect.py'
# Puppeteer fast
docker run --network changedet-network \
test-changedetectionio \
bash -c 'cd changedetectionio && FAST_PUPPETEER_CHROME_FETCHER=1 PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 pytest tests/proxy_list/test_proxy_noconnect.py'
# Selenium
docker run --network changedet-network \
test-changedetectionio \
bash -c 'cd changedetectionio && WEBDRIVER_URL=http://selenium:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py'

View File

@@ -0,0 +1,30 @@
.watch-table {
td,
th {
vertical-align: middle;
}
td.inline.title-col {
display: flex;
align-items: center;
gap: 0.5em;
flex-wrap: wrap;
* {
display: inline-block;
}
img.thumbnail {
width: 32px;
object-fit: cover; /* crop/fill if needed */
border-radius: 8px; /* subtle rounded corners */
box-shadow: 0 2px 6px rgba(0, 0, 0, 0.15); /* soft shadow */
border: 1px solid #ddd; /* light border for contrast */
filter: contrast(1.05) saturate(1.1) drop-shadow(0 0 0.5px rgba(0, 0, 0, 0.2));
background-color: #fff; /* fallback bg for SVGs without bg */
}
}
}

View File

@@ -15,6 +15,7 @@
@import "parts/preview_text_filter";
@import "parts/_edit";
@import "parts/_conditions_table";
@import "parts/_lister_extra";
body {
color: var(--color-text);

View File

@@ -623,6 +623,31 @@ ul#conditions_match_logic {
.fieldlist_formfields .addRuleRow:hover, .fieldlist_formfields .removeRuleRow:hover, .fieldlist_formfields .verifyRuleRow:hover {
background-color: #999; }
.watch-table td,
.watch-table th {
vertical-align: middle; }
.watch-table td.inline.title-col {
display: flex;
align-items: center;
gap: 0.5em;
flex-wrap: wrap; }
.watch-table td.inline.title-col * {
display: inline-block; }
.watch-table td.inline.title-col img.thumbnail {
width: 32px;
object-fit: cover;
/* crop/fill if needed */
border-radius: 8px;
/* subtle rounded corners */
box-shadow: 0 2px 6px rgba(0, 0, 0, 0.15);
/* soft shadow */
border: 1px solid #ddd;
/* light border for contrast */
filter: contrast(1.05) saturate(1.1) drop-shadow(0 0 0.5px rgba(0, 0, 0, 0.2));
background-color: #fff;
/* fallback bg for SVGs without bg */ }
body {
color: var(--color-text);
background: var(--color-background-page);

View File

@@ -0,0 +1,68 @@
#!/usr/bin/env python3
from flask import url_for
from ..util import live_server_setup, wait_for_all_checks
import os
from ... import strtobool
# Just to be sure the UI outputs the right error message on proxy connection failed
# docker run -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome:4
# PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
# FAST_PUPPETEER_CHROME_FETCHER=True PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
# WEBDRIVER_URL=http://127.0.0.1:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py
def test_proxy_noconnect_custom(client, live_server, measure_memory_usage):
live_server_setup(live_server)
# Goto settings, add our custom one
res = client.post(
url_for("settings.settings_page"),
data={
"requests-time_between_check-minutes": 180,
"application-ignore_whitespace": "y",
"application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else 'html_requests',
"requests-extra_proxies-0-proxy_name": "custom-test-proxy",
# test:awesome is set in tests/proxy_list/squid-passwords.txt
"requests-extra_proxies-0-proxy_url": "http://127.0.0.1:3128",
},
follow_redirects=True
)
assert b"Settings updated." in res.data
test_url = "https://changedetection.io"
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
follow_redirects=True
)
assert b"Watch added in Paused state, saving will unpause" in res.data
options = {
"url": test_url,
"fetch_backend": "html_webdriver" if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else "html_requests",
"proxy": "ui-0custom-test-proxy",
}
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
data=options,
follow_redirects=True
)
assert b"unpaused" in res.data
import time
wait_for_all_checks(client)
# Requests default
check_string = b'Cannot connect to proxy'
if os.getenv('PLAYWRIGHT_DRIVER_URL') or strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or os.getenv("WEBDRIVER_URL"):
check_string = b'ERR_PROXY_CONNECTION_FAILED'
res = client.get(url_for("watchlist.index"))
#with open("/tmp/debug.html", 'wb') as f:
# f.write(res.data)
assert check_string in res.data

View File

@@ -14,6 +14,8 @@ from changedetectionio.notification import (
def set_original_response():
test_return_data = """<html>
<body>
<section id=header style="padding: 50px; height: 350px">This is the header which should be ignored always - <span>add to cart</span></section>
<!-- stock-not-in-stock.js will ignore text in the first 300px, see elementIsInEyeBallRange(), sometimes "add to cart" and other junk is here -->
Some initial text<br>
<p>Which is across multiple lines</p>
<br>
@@ -52,8 +54,6 @@ def test_restock_detection(client, live_server, measure_memory_usage):
set_original_response()
#assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
time.sleep(1)
live_server_setup(live_server)
#####################
notification_url = url_for('test_notification_endpoint', _external=True).replace('http://localhost', 'http://changedet').replace('http', 'json')
@@ -84,7 +84,8 @@ def test_restock_detection(client, live_server, measure_memory_usage):
# Is it correctly show as NOT in stock?
wait_for_all_checks(client)
res = client.get(url_for("watchlist.index"))
assert b'not-in-stock' in res.data
assert b'processor-restock_diff' in res.data # Should have saved in restock mode
assert b'not-in-stock' in res.data # should be out of stock
# Is it correctly shown as in stock
set_back_in_stock_response()

View File

@@ -196,7 +196,11 @@ def test_condition_validate_rule_row(client, live_server):
)
assert res.status_code == 200
assert b'false' in res.data
# cleanup for the next
client.get(
url_for("ui.form_delete", uuid="all"),
follow_redirects=True
)
@@ -235,4 +239,107 @@ def test_wordcount_conditions_plugin(client, live_server, measure_memory_usage):
)
# Assert the word count is counted correctly
assert b'<td>13</td>' in res.data
assert b'<td>13</td>' in res.data
# cleanup for the next
client.get(
url_for("ui.form_delete", uuid="all"),
follow_redirects=True
)
# If there was only a change in the whitespacing, then we shouldnt have a change detected
def test_lev_conditions_plugin(client, live_server, measure_memory_usage):
#live_server_setup(live_server)
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("""<html>
<body>
Some initial text<br>
<p>Which is across multiple lines</p>
<br>
So let's see what happens. <br>
</body>
</html>
""")
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
follow_redirects=True
)
assert b"Watch added in Paused state, saving will unpause" in res.data
uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
# Give the thread time to pick it up
wait_for_all_checks(client)
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid, unpause_on_save=1),
data={
"url": test_url,
"fetch_backend": "html_requests",
"conditions_match_logic": "ALL", # ALL = AND logic
"conditions-0-field": "levenshtein_ratio",
"conditions-0-operator": "<",
"conditions-0-value": "0.8" # needs to be more of a diff to trigger a change
},
follow_redirects=True
)
assert b"unpaused" in res.data
wait_for_all_checks(client)
res = client.get(url_for("watchlist.index"))
assert b'unviewed' not in res.data
# Check the content saved initially, even tho a condition was set - this is the first snapshot so shouldnt be affected by conditions
res = client.get(
url_for("ui.ui_views.preview_page", uuid=uuid),
follow_redirects=True
)
assert b'Which is across multiple lines' in res.data
############### Now change it a LITTLE bit...
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("""<html>
<body>
Some initial text<br>
<p>Which is across multiple lines</p>
<br>
So let's see what happenxxxxxxxxx. <br>
</body>
</html>
""")
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
assert b'Queued 1 watch for rechecking.' in res.data
wait_for_all_checks(client)
res = client.get(url_for("watchlist.index"))
assert b'unviewed' not in res.data #because this will be like 0.90 not 0.8 threshold
############### Now change it a MORE THAN 50%
test_return_data = """<html>
<body>
Some sxxxx<br>
<p>Which is across a lines</p>
<br>
ok. <br>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
assert b'Queued 1 watch for rechecking.' in res.data
wait_for_all_checks(client)
res = client.get(url_for("watchlist.index"))
assert b'unviewed' in res.data
# cleanup for the next
client.get(
url_for("ui.form_delete", uuid="all"),
follow_redirects=True
)

View File

@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
def test_consistent_history(client, live_server, measure_memory_usage):
live_server_setup(live_server)
r = range(1, 30)
workers = int(os.getenv("FETCH_WORKERS", 10))
r = range(1, 10+workers)
for one in r:
test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
@@ -46,9 +46,10 @@ def test_consistent_history(client, live_server, measure_memory_usage):
# assert the right amount of watches was found in the JSON
assert len(json_obj['watching']) == len(r), "Correct number of watches was found in the JSON"
i=0
# each one should have a history.txt containing just one line
for w in json_obj['watching'].keys():
i+=1
history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
@@ -58,8 +59,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
assert len(tmp_history) == 1, "History.txt should contain 1 line"
# Should be two files,. the history.txt , and the snapshot.txt
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
w))
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
# Find the snapshot one
for fname in files_in_watch_dir:
if fname != 'history.txt' and 'html' not in fname:
@@ -75,7 +76,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
with open(json_db_file, 'r') as f:
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"

View File

@@ -424,3 +424,27 @@ def test_headers_textfile_in_request(client, live_server, measure_memory_usage):
# unlink headers.txt on start/stop
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_headers_validation(client, live_server):
#live_server_setup(live_server)
test_url = url_for('test_headers', _external=True)
res = client.post(
url_for("imports.import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"fetch_backend": 'html_requests',
"headers": "User-AGent agent-from-watch\r\nsadfsadfsadfsdaf\r\n:foobar"},
follow_redirects=True
)
assert b"Line 1 is missing a &#39;:&#39; separator." in res.data
assert b"Line 3 has an empty key." in res.data

View File

@@ -126,18 +126,51 @@ def extract_UUID_from_client(client):
uuid = m.group(1)
return uuid.strip()
def wait_for_all_checks(client):
# actually this is not entirely true, it can still be 'processing' but not in the queue
# Loop waiting until done..
attempt=0
# because sub-second rechecks are problematic in testing, use lots of delays
time.sleep(1)
while attempt < 60:
res = client.get(url_for("watchlist.index"))
if not b'Checking now' in res.data:
break
logging.getLogger().info("Waiting for watch-list to not say 'Checking now'.. {}".format(attempt))
time.sleep(1)
def wait_for_all_checks(client=None):
"""
Waits until the queue is empty and remains empty for at least `required_empty_duration` seconds,
and also ensures no running threads have `current_uuid` set.
Retries for up to `max_attempts` times, sleeping `wait_between_attempts` seconds between checks.
"""
from changedetectionio.flask_app import update_q as global_update_q, running_update_threads
# Configuration
attempt = 0
i=0
max_attempts = 60
wait_between_attempts = 2
required_empty_duration = 2
logger = logging.getLogger()
time.sleep(1.2)
empty_since = None
while attempt < max_attempts:
q_length = global_update_q.qsize()
# Check if any threads are still processing
time.sleep(1.2)
any_threads_busy = any(t.current_uuid for t in running_update_threads)
if q_length == 0 and not any_threads_busy:
if empty_since is None:
empty_since = time.time()
logger.info(f"Queue empty and no active threads at attempt {attempt}, starting empty timer...")
elif time.time() - empty_since >= required_empty_duration:
logger.info(f"Queue has been empty and threads idle for {required_empty_duration} seconds. Done waiting.")
break
else:
logger.info(f"Still waiting: queue empty and no active threads, but not yet {required_empty_duration} seconds...")
else:
if q_length != 0:
logger.info(f"Queue not empty (size={q_length}), resetting timer.")
if any_threads_busy:
busy_threads = [t.name for t in running_update_threads if t.current_uuid]
logger.info(f"Threads still busy: {busy_threads}, resetting timer.")
empty_since = None
attempt += 1
time.sleep(1)

View File

@@ -42,7 +42,7 @@ paho-mqtt!=2.0.*
cryptography~=42.0.8
# Used for CSS filtering
beautifulsoup4
beautifulsoup4>=4.0.0
# XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
# #2328 - 5.2.0 and 5.2.1 had extra CPU flag CFLAGS set which was not compatible on older hardware
@@ -53,7 +53,7 @@ lxml >=4.8.0,<6,!=5.2.0,!=5.2.1
# XPath 2.0-3.1 support - 4.2.0 broke something?
elementpath==4.1.5
selenium~=4.14.0
selenium~=4.31.0
# https://github.com/pallets/werkzeug/issues/2985
# Maybe related to pytest?
@@ -70,7 +70,7 @@ jq~=1.3; python_version >= "3.8" and sys_platform == "linux"
# playwright is installed at Dockerfile build time because it's not available on all platforms
pyppeteer-ng==2.0.0rc9
pyppeteer-ng==2.0.0rc10
pyppeteerstealth>=0.0.4
@@ -90,6 +90,8 @@ extruct
# For cleaning up unknown currency formats
babel
levenshtein
# Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096
greenlet >= 3.0.3