Compare commits
39 Commits
0.49.10
...
improved-g
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
eb4cd35317 | ||
|
|
dc75043562 | ||
|
|
38fffda890 | ||
|
|
af568d064c | ||
|
|
a75f57de43 | ||
|
|
72a1c3dda1 | ||
|
|
ffde79ecac | ||
|
|
66ad43b2df | ||
|
|
6b0e56ca80 | ||
|
|
5a2d84d8b4 | ||
|
|
a941156f26 | ||
|
|
a1fdeeaa29 | ||
|
|
40ea2604a7 | ||
|
|
ceda526093 | ||
|
|
4197254c53 | ||
|
|
a0b7efb436 | ||
|
|
5f5e8ede6c | ||
|
|
52ca855a29 | ||
|
|
079efd0a85 | ||
|
|
3a583a4e5d | ||
|
|
cfb4decf67 | ||
|
|
8067d5170b | ||
|
|
5551acf67d | ||
|
|
45a030bac6 | ||
|
|
96dc49e229 | ||
|
|
5f43d988a3 | ||
|
|
4269079c54 | ||
|
|
cdfb3f206c | ||
|
|
9f326783e5 | ||
|
|
4e6e680d79 | ||
|
|
1378b5b2ff | ||
|
|
456c6e3f58 | ||
|
|
61be7f68db | ||
|
|
0e38a3c881 | ||
|
|
2c630e9853 | ||
|
|
786e0d1fab | ||
|
|
78b7aee512 | ||
|
|
9d9d01863a | ||
|
|
108cdf84a5 |
15
.github/workflows/test-only.yml
vendored
15
.github/workflows/test-only.yml
vendored
@@ -8,13 +8,13 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Lint with flake8
|
||||
- name: Lint with Ruff
|
||||
run: |
|
||||
pip3 install flake8
|
||||
# stop the build if there are Python syntax errors or undefined names
|
||||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
||||
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
||||
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
||||
pip install ruff
|
||||
# Check for syntax errors and undefined names
|
||||
ruff check . --select E9,F63,F7,F82
|
||||
# Complete check with errors treated as warnings
|
||||
ruff check . --exit-zero
|
||||
|
||||
test-application-3-10:
|
||||
needs: lint-code
|
||||
@@ -41,5 +41,4 @@ jobs:
|
||||
uses: ./.github/workflows/test-stack-reusable-workflow.yml
|
||||
with:
|
||||
python-version: '3.13'
|
||||
skip-pypuppeteer: true
|
||||
|
||||
skip-pypuppeteer: true
|
||||
@@ -172,8 +172,8 @@ jobs:
|
||||
curl --retry-connrefused --retry 6 -s -g -6 "http://[::1]:5556"|grep -q checkbox-uuid
|
||||
|
||||
# Check whether TRACE log is enabled.
|
||||
# Also, check whether TRACE is came from STDERR
|
||||
docker logs test-changedetectionio 2>&1 1>/dev/null | grep 'TRACE log is enabled' || exit 1
|
||||
# Also, check whether TRACE came from STDOUT
|
||||
docker logs test-changedetectionio 2>/dev/null | grep 'TRACE log is enabled' || exit 1
|
||||
# Check whether DEBUG is came from STDOUT
|
||||
docker logs test-changedetectionio 2>/dev/null | grep 'DEBUG' || exit 1
|
||||
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -16,6 +16,7 @@ dist/
|
||||
.env
|
||||
.venv/
|
||||
venv/
|
||||
.python-version
|
||||
|
||||
# IDEs
|
||||
.idea
|
||||
|
||||
9
.pre-commit-config.yaml
Normal file
9
.pre-commit-config.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
repos:
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.11.2
|
||||
hooks:
|
||||
# Lint (and apply safe fixes)
|
||||
- id: ruff
|
||||
args: [--fix]
|
||||
# Fomrat
|
||||
- id: ruff-format
|
||||
48
.ruff.toml
Normal file
48
.ruff.toml
Normal file
@@ -0,0 +1,48 @@
|
||||
# Minimum supported version
|
||||
target-version = "py310"
|
||||
|
||||
# Formatting options
|
||||
line-length = 100
|
||||
indent-width = 4
|
||||
|
||||
exclude = [
|
||||
"__pycache__",
|
||||
".eggs",
|
||||
".git",
|
||||
".tox",
|
||||
".venv",
|
||||
"*.egg-info",
|
||||
"*.pyc",
|
||||
]
|
||||
|
||||
[lint]
|
||||
# https://docs.astral.sh/ruff/rules/
|
||||
select = [
|
||||
"B", # flake8-bugbear
|
||||
"B9",
|
||||
"C",
|
||||
"E", # pycodestyle
|
||||
"F", # Pyflakes
|
||||
"I", # isort
|
||||
"N", # pep8-naming
|
||||
"UP", # pyupgrade
|
||||
"W", # pycodestyle
|
||||
]
|
||||
ignore = [
|
||||
"B007", # unused-loop-control-variable
|
||||
"B909", # loop-iterator-mutation
|
||||
"E203", # whitespace-before-punctuation
|
||||
"E266", # multiple-leading-hashes-for-block-comment
|
||||
"E501", # redundant-backslash
|
||||
"F403", # undefined-local-with-import-star
|
||||
"N802", # invalid-function-name
|
||||
"N806", # non-lowercase-variable-in-function
|
||||
"N815", # mixed-case-variable-in-class-scope
|
||||
]
|
||||
|
||||
[lint.mccabe]
|
||||
max-complexity = 12
|
||||
|
||||
[format]
|
||||
indent-style = "space"
|
||||
quote-style = "preserve"
|
||||
@@ -68,7 +68,7 @@ COPY changedetection.py /app/changedetection.py
|
||||
# Github Action test purpose(test-only.yml).
|
||||
# On production, it is effectively LOGGER_LEVEL=''.
|
||||
ARG LOGGER_LEVEL=''
|
||||
ENV LOGGER_LEVEL "$LOGGER_LEVEL"
|
||||
ENV LOGGER_LEVEL="$LOGGER_LEVEL"
|
||||
|
||||
WORKDIR /app
|
||||
CMD ["python", "./changedetection.py", "-d", "/datastore"]
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
recursive-include changedetectionio/api *
|
||||
recursive-include changedetectionio/apprise_plugin *
|
||||
recursive-include changedetectionio/blueprint *
|
||||
recursive-include changedetectionio/content_fetchers *
|
||||
recursive-include changedetectionio/conditions *
|
||||
recursive-include changedetectionio/model *
|
||||
recursive-include changedetectionio/notification *
|
||||
recursive-include changedetectionio/processors *
|
||||
recursive-include changedetectionio/static *
|
||||
recursive-include changedetectionio/templates *
|
||||
|
||||
@@ -105,6 +105,12 @@ We [recommend and use Bright Data](https://brightdata.grsm.io/n0r16zf7eivq) glob
|
||||
|
||||
Please :star: star :star: this project and help it grow! https://github.com/dgtlmoon/changedetection.io/
|
||||
|
||||
### Conditional web page changes
|
||||
|
||||
Easily [configure conditional actions](https://changedetection.io/tutorial/conditional-actions-web-page-changes), for example, only trigger when a price is above or below a preset amount, or [when a web page includes (or does not include) a keyword](https://changedetection.io/tutorial/how-monitor-keywords-any-website)
|
||||
|
||||
<img src="./docs/web-page-change-conditions.png" style="max-width:80%;" alt="Conditional web page changes" title="Conditional web page changes" />
|
||||
|
||||
### Schedule web page watches in any timezone, limit by day of week and time.
|
||||
|
||||
Easily set a re-check schedule, for example you could limit the web page change detection to only operate during business hours.
|
||||
|
||||
@@ -3,4 +3,6 @@
|
||||
# Only exists for direct CLI usage
|
||||
|
||||
import changedetectionio
|
||||
changedetectionio.main()
|
||||
|
||||
if __name__ == '__main__':
|
||||
changedetectionio.main()
|
||||
|
||||
98
changedetectionio/PLUGIN_README.md
Normal file
98
changedetectionio/PLUGIN_README.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# Creating Plugins for changedetection.io
|
||||
|
||||
This document describes how to create plugins for changedetection.io. Plugins can be used to extend the functionality of the application in various ways.
|
||||
|
||||
## Plugin Types
|
||||
|
||||
### UI Stats Tab Plugins
|
||||
|
||||
These plugins can add content to the Stats tab in the Edit page. This is useful for adding custom statistics or visualizations about a watch.
|
||||
|
||||
#### Creating a UI Stats Tab Plugin
|
||||
|
||||
1. Create a Python file in a directory that will be loaded by the plugin system.
|
||||
|
||||
2. Use the `global_hookimpl` decorator to implement the `ui_edit_stats_extras` hook:
|
||||
|
||||
```python
|
||||
import pluggy
|
||||
from loguru import logger
|
||||
|
||||
global_hookimpl = pluggy.HookimplMarker("changedetectionio")
|
||||
|
||||
@global_hookimpl
|
||||
def ui_edit_stats_extras(watch):
|
||||
"""Add custom content to the stats tab"""
|
||||
# Calculate or retrieve your stats
|
||||
my_stat = calculate_something(watch)
|
||||
|
||||
# Return HTML content as a string
|
||||
html = f"""
|
||||
<div class="my-plugin-stats">
|
||||
<h4>My Plugin Statistics</h4>
|
||||
<p>My statistic: {my_stat}</p>
|
||||
</div>
|
||||
"""
|
||||
return html
|
||||
```
|
||||
|
||||
3. The HTML you return will be included in the Stats tab.
|
||||
|
||||
## Plugin Loading
|
||||
|
||||
Plugins can be loaded from:
|
||||
|
||||
1. Built-in plugin directories in the codebase
|
||||
2. External packages using setuptools entry points
|
||||
|
||||
To add a new plugin directory, modify the `plugin_dirs` dictionary in `pluggy_interface.py`.
|
||||
|
||||
## Example Plugin
|
||||
|
||||
Here's a simple example of a plugin that adds a word count statistic to the Stats tab:
|
||||
|
||||
```python
|
||||
import pluggy
|
||||
from loguru import logger
|
||||
|
||||
global_hookimpl = pluggy.HookimplMarker("changedetectionio")
|
||||
|
||||
def count_words_in_history(watch):
|
||||
"""Count words in the latest snapshot"""
|
||||
try:
|
||||
if not watch.history.keys():
|
||||
return 0
|
||||
|
||||
latest_key = list(watch.history.keys())[-1]
|
||||
latest_content = watch.get_history_snapshot(latest_key)
|
||||
return len(latest_content.split())
|
||||
except Exception as e:
|
||||
logger.error(f"Error counting words: {str(e)}")
|
||||
return 0
|
||||
|
||||
@global_hookimpl
|
||||
def ui_edit_stats_extras(watch):
|
||||
"""Add word count to the Stats tab"""
|
||||
word_count = count_words_in_history(watch)
|
||||
|
||||
html = f"""
|
||||
<div class="word-count-stats">
|
||||
<h4>Content Analysis</h4>
|
||||
<table class="pure-table">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Word count (latest snapshot)</td>
|
||||
<td>{word_count}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
"""
|
||||
return html
|
||||
```
|
||||
|
||||
## Testing Your Plugin
|
||||
|
||||
1. Place your plugin in one of the directories scanned by the plugin system
|
||||
2. Restart changedetection.io
|
||||
3. Go to the Edit page of a watch and check the Stats tab to see your content
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
||||
|
||||
__version__ = '0.49.10'
|
||||
__version__ = '0.49.15'
|
||||
|
||||
from changedetectionio.strtobool import strtobool
|
||||
from json.decoder import JSONDecodeError
|
||||
@@ -11,6 +11,7 @@ os.environ['EVENTLET_NO_GREENDNS'] = 'yes'
|
||||
import eventlet
|
||||
import eventlet.wsgi
|
||||
import getopt
|
||||
import platform
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
@@ -105,7 +106,7 @@ def main():
|
||||
# Without this, a logger will be duplicated
|
||||
logger.remove()
|
||||
try:
|
||||
log_level_for_stdout = { 'DEBUG', 'SUCCESS' }
|
||||
log_level_for_stdout = { 'TRACE', 'DEBUG', 'INFO', 'SUCCESS' }
|
||||
logger.configure(handlers=[
|
||||
{"sink": sys.stdout, "level": logger_level,
|
||||
"filter" : lambda record: record['level'].name in log_level_for_stdout},
|
||||
@@ -150,9 +151,13 @@ def main():
|
||||
from changedetectionio.gc_cleanup import memory_cleanup
|
||||
logger.info('SIGUSR1 received: Running memory cleanup')
|
||||
return memory_cleanup(app)
|
||||
|
||||
|
||||
# Register the SIGUSR1 signal handler
|
||||
signal.signal(signal.SIGUSR1, sigusr_clean_handler)
|
||||
# Only register the signal handler if running on Linux
|
||||
if platform.system() == "Linux":
|
||||
signal.signal(signal.SIGUSR1, sigusr_clean_handler)
|
||||
else:
|
||||
logger.info("SIGUSR1 handler only registered on Linux, skipped.")
|
||||
|
||||
# Go into cleanup mode
|
||||
if do_cleanup:
|
||||
|
||||
145
changedetectionio/api/Notifications.py
Normal file
145
changedetectionio/api/Notifications.py
Normal file
@@ -0,0 +1,145 @@
|
||||
from flask_expects_json import expects_json
|
||||
from flask_restful import Resource
|
||||
from . import auth
|
||||
from flask_restful import abort, Resource
|
||||
from flask import request
|
||||
from . import auth
|
||||
from . import schema_create_notification_urls, schema_delete_notification_urls
|
||||
|
||||
class Notifications(Resource):
|
||||
def __init__(self, **kwargs):
|
||||
# datastore is a black box dependency
|
||||
self.datastore = kwargs['datastore']
|
||||
|
||||
@auth.check_token
|
||||
def get(self):
|
||||
"""
|
||||
@api {get} /api/v1/notifications Return Notification URL List
|
||||
@apiDescription Return the Notification URL List from the configuration
|
||||
@apiExample {curl} Example usage:
|
||||
curl http://localhost:5000/api/v1/notifications -H"x-api-key:813031b16330fe25e3780cf0325daa45"
|
||||
HTTP/1.0 200
|
||||
{
|
||||
'notification_urls': ["notification-urls-list"]
|
||||
}
|
||||
@apiName Get
|
||||
@apiGroup Notifications
|
||||
"""
|
||||
|
||||
notification_urls = self.datastore.data.get('settings', {}).get('application', {}).get('notification_urls', [])
|
||||
|
||||
return {
|
||||
'notification_urls': notification_urls,
|
||||
}, 200
|
||||
|
||||
@auth.check_token
|
||||
@expects_json(schema_create_notification_urls)
|
||||
def post(self):
|
||||
"""
|
||||
@api {post} /api/v1/notifications Create Notification URLs
|
||||
@apiDescription Add one or more notification URLs from the configuration
|
||||
@apiExample {curl} Example usage:
|
||||
curl http://localhost:5000/api/v1/notifications/batch -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
|
||||
@apiName CreateBatch
|
||||
@apiGroup Notifications
|
||||
@apiSuccess (201) {Object[]} notification_urls List of added notification URLs
|
||||
@apiError (400) {String} Invalid input
|
||||
"""
|
||||
|
||||
json_data = request.get_json()
|
||||
notification_urls = json_data.get("notification_urls", [])
|
||||
|
||||
from wtforms import ValidationError
|
||||
try:
|
||||
validate_notification_urls(notification_urls)
|
||||
except ValidationError as e:
|
||||
return str(e), 400
|
||||
|
||||
added_urls = []
|
||||
|
||||
for url in notification_urls:
|
||||
clean_url = url.strip()
|
||||
added_url = self.datastore.add_notification_url(clean_url)
|
||||
if added_url:
|
||||
added_urls.append(added_url)
|
||||
|
||||
if not added_urls:
|
||||
return "No valid notification URLs were added", 400
|
||||
|
||||
return {'notification_urls': added_urls}, 201
|
||||
|
||||
@auth.check_token
|
||||
@expects_json(schema_create_notification_urls)
|
||||
def put(self):
|
||||
"""
|
||||
@api {put} /api/v1/notifications Replace Notification URLs
|
||||
@apiDescription Replace all notification URLs with the provided list (can be empty)
|
||||
@apiExample {curl} Example usage:
|
||||
curl -X PUT http://localhost:5000/api/v1/notifications -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
|
||||
@apiName Replace
|
||||
@apiGroup Notifications
|
||||
@apiSuccess (200) {Object[]} notification_urls List of current notification URLs
|
||||
@apiError (400) {String} Invalid input
|
||||
"""
|
||||
json_data = request.get_json()
|
||||
notification_urls = json_data.get("notification_urls", [])
|
||||
|
||||
from wtforms import ValidationError
|
||||
try:
|
||||
validate_notification_urls(notification_urls)
|
||||
except ValidationError as e:
|
||||
return str(e), 400
|
||||
|
||||
if not isinstance(notification_urls, list):
|
||||
return "Invalid input format", 400
|
||||
|
||||
clean_urls = [url.strip() for url in notification_urls if isinstance(url, str)]
|
||||
self.datastore.data['settings']['application']['notification_urls'] = clean_urls
|
||||
self.datastore.needs_write = True
|
||||
|
||||
return {'notification_urls': clean_urls}, 200
|
||||
|
||||
@auth.check_token
|
||||
@expects_json(schema_delete_notification_urls)
|
||||
def delete(self):
|
||||
"""
|
||||
@api {delete} /api/v1/notifications Delete Notification URLs
|
||||
@apiDescription Deletes one or more notification URLs from the configuration
|
||||
@apiExample {curl} Example usage:
|
||||
curl http://localhost:5000/api/v1/notifications -X DELETE -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
|
||||
@apiParam {String[]} notification_urls The notification URLs to delete.
|
||||
@apiName Delete
|
||||
@apiGroup Notifications
|
||||
@apiSuccess (204) {String} OK Deleted
|
||||
@apiError (400) {String} No matching notification URLs found.
|
||||
"""
|
||||
|
||||
json_data = request.get_json()
|
||||
urls_to_delete = json_data.get("notification_urls", [])
|
||||
if not isinstance(urls_to_delete, list):
|
||||
abort(400, message="Expected a list of notification URLs.")
|
||||
|
||||
notification_urls = self.datastore.data['settings']['application'].get('notification_urls', [])
|
||||
deleted = []
|
||||
|
||||
for url in urls_to_delete:
|
||||
clean_url = url.strip()
|
||||
if clean_url in notification_urls:
|
||||
notification_urls.remove(clean_url)
|
||||
deleted.append(clean_url)
|
||||
|
||||
if not deleted:
|
||||
abort(400, message="No matching notification URLs found.")
|
||||
|
||||
self.datastore.data['settings']['application']['notification_urls'] = notification_urls
|
||||
self.datastore.needs_write = True
|
||||
|
||||
return 'OK', 204
|
||||
|
||||
def validate_notification_urls(notification_urls):
|
||||
from changedetectionio.forms import ValidateAppRiseServers
|
||||
validator = ValidateAppRiseServers()
|
||||
class DummyForm: pass
|
||||
dummy_form = DummyForm()
|
||||
field = type("Field", (object,), {"data": notification_urls, "gettext": lambda self, x: x})()
|
||||
validator(dummy_form, field)
|
||||
@@ -19,8 +19,15 @@ schema_create_tag['required'] = ['title']
|
||||
schema_update_tag = copy.deepcopy(schema_tag)
|
||||
schema_update_tag['additionalProperties'] = False
|
||||
|
||||
schema_notification_urls = copy.deepcopy(schema)
|
||||
schema_create_notification_urls = copy.deepcopy(schema_notification_urls)
|
||||
schema_create_notification_urls['required'] = ['notification_urls']
|
||||
schema_delete_notification_urls = copy.deepcopy(schema_notification_urls)
|
||||
schema_delete_notification_urls['required'] = ['notification_urls']
|
||||
|
||||
# Import all API resources
|
||||
from .Watch import Watch, WatchHistory, WatchSingleHistory, CreateWatch
|
||||
from .Tags import Tags, Tag
|
||||
from .Import import Import
|
||||
from .SystemInfo import SystemInfo
|
||||
from .Notifications import Notifications
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
# Responsible for building the storage dict into a set of rules ("JSON Schema") acceptable via the API
|
||||
# Probably other ways to solve this when the backend switches to some ORM
|
||||
from changedetectionio.notification import valid_notification_formats
|
||||
|
||||
|
||||
def build_time_between_check_json_schema():
|
||||
# Setup time between check schema
|
||||
@@ -98,8 +100,6 @@ def build_watch_json_schema(d):
|
||||
}
|
||||
}
|
||||
|
||||
from changedetectionio.notification import valid_notification_formats
|
||||
|
||||
schema['properties']['notification_format'] = {'type': 'string',
|
||||
'enum': list(valid_notification_formats.keys())
|
||||
}
|
||||
|
||||
@@ -53,14 +53,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
a = "?" if not '?' in base_url else '&'
|
||||
base_url += a + f"timeout={keepalive_ms}"
|
||||
|
||||
try:
|
||||
browsersteps_start_session['browser'] = io_interface_context.chromium.connect_over_cdp(base_url)
|
||||
except Exception as e:
|
||||
if 'ECONNREFUSED' in str(e):
|
||||
return make_response('Unable to start the Playwright Browser session, is it running?', 401)
|
||||
else:
|
||||
# Other errors, bad URL syntax, bad reply etc
|
||||
return make_response(str(e), 401)
|
||||
browsersteps_start_session['browser'] = io_interface_context.chromium.connect_over_cdp(base_url)
|
||||
|
||||
proxy_id = datastore.get_preferred_proxy_for_watch(uuid=watch_uuid)
|
||||
proxy = None
|
||||
@@ -109,7 +102,16 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
|
||||
logger.debug("Starting connection with playwright")
|
||||
logger.debug("browser_steps.py connecting")
|
||||
browsersteps_sessions[browsersteps_session_id] = start_browsersteps_session(watch_uuid)
|
||||
|
||||
try:
|
||||
browsersteps_sessions[browsersteps_session_id] = start_browsersteps_session(watch_uuid)
|
||||
except Exception as e:
|
||||
if 'ECONNREFUSED' in str(e):
|
||||
return make_response('Unable to start the Playwright Browser session, is sockpuppetbrowser running? Network configuration is OK?', 401)
|
||||
else:
|
||||
# Other errors, bad URL syntax, bad reply etc
|
||||
return make_response(str(e), 401)
|
||||
|
||||
logger.debug("Starting connection with playwright - done")
|
||||
return {'browsersteps_session_id': browsersteps_session_id}
|
||||
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
import os
|
||||
import time
|
||||
import re
|
||||
import sys
|
||||
import traceback
|
||||
from random import randint
|
||||
from loguru import logger
|
||||
|
||||
from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD
|
||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT
|
||||
from changedetectionio.content_fetchers.base import manage_user_agent
|
||||
from changedetectionio.safe_jinja import render as jinja_render
|
||||
|
||||
@@ -35,6 +37,7 @@ browser_step_ui_config = {'Choose one': '0 0',
|
||||
'Make all child elements visible': '1 0',
|
||||
'Press Enter': '0 0',
|
||||
'Select by label': '1 1',
|
||||
'<select> by option text': '1 1',
|
||||
'Scroll down': '0 0',
|
||||
'Uncheck checkbox': '1 0',
|
||||
'Wait for seconds': '0 1',
|
||||
@@ -54,14 +57,34 @@ browser_step_ui_config = {'Choose one': '0 0',
|
||||
class steppable_browser_interface():
|
||||
page = None
|
||||
start_url = None
|
||||
|
||||
action_timeout = 10 * 1000
|
||||
|
||||
def __init__(self, start_url):
|
||||
self.start_url = start_url
|
||||
|
||||
def safe_page_operation(self, operation_fn, default_return=None):
|
||||
"""Safely execute a page operation with error handling"""
|
||||
if self.page is None:
|
||||
logger.warning("Attempted operation on None page object")
|
||||
return default_return
|
||||
|
||||
try:
|
||||
return operation_fn()
|
||||
except Exception as e:
|
||||
logger.debug(f"Page operation failed: {str(e)}")
|
||||
# Try to reclaim memory if possible
|
||||
try:
|
||||
self.page.request_gc()
|
||||
except:
|
||||
pass
|
||||
return default_return
|
||||
|
||||
# Convert and perform "Click Button" for example
|
||||
def call_action(self, action_name, selector=None, optional_value=None):
|
||||
if self.page is None:
|
||||
logger.warning("Cannot call action on None page object")
|
||||
return
|
||||
|
||||
now = time.time()
|
||||
call_action_name = re.sub('[^0-9a-zA-Z]+', '_', action_name.lower())
|
||||
if call_action_name == 'choose_one':
|
||||
@@ -72,28 +95,46 @@ class steppable_browser_interface():
|
||||
if selector and selector.startswith('/') and not selector.startswith('//'):
|
||||
selector = "xpath=" + selector
|
||||
|
||||
# Check if action handler exists
|
||||
if not hasattr(self, "action_" + call_action_name):
|
||||
logger.warning(f"Action handler for '{call_action_name}' not found")
|
||||
return
|
||||
|
||||
action_handler = getattr(self, "action_" + call_action_name)
|
||||
|
||||
# Support for Jinja2 variables in the value and selector
|
||||
|
||||
if selector and ('{%' in selector or '{{' in selector):
|
||||
selector = jinja_render(template_str=selector)
|
||||
|
||||
if optional_value and ('{%' in optional_value or '{{' in optional_value):
|
||||
optional_value = jinja_render(template_str=optional_value)
|
||||
|
||||
action_handler(selector, optional_value)
|
||||
self.page.wait_for_timeout(1.5 * 1000)
|
||||
logger.debug(f"Call action done in {time.time()-now:.2f}s")
|
||||
try:
|
||||
action_handler(selector, optional_value)
|
||||
# Safely wait for timeout
|
||||
def wait_timeout():
|
||||
self.page.wait_for_timeout(1.5 * 1000)
|
||||
self.safe_page_operation(wait_timeout)
|
||||
logger.debug(f"Call action done in {time.time()-now:.2f}s")
|
||||
except Exception as e:
|
||||
logger.error(f"Error executing action '{call_action_name}': {str(e)}")
|
||||
# Request garbage collection to free up resources after error
|
||||
try:
|
||||
self.page.request_gc()
|
||||
except:
|
||||
pass
|
||||
|
||||
def action_goto_url(self, selector=None, value=None):
|
||||
# self.page.set_viewport_size({"width": 1280, "height": 5000})
|
||||
if not value:
|
||||
logger.warning("No URL provided for goto_url action")
|
||||
return None
|
||||
|
||||
now = time.time()
|
||||
response = self.page.goto(value, timeout=0, wait_until='load')
|
||||
# Should be the same as the puppeteer_fetch.js methods, means, load with no timeout set (skip timeout)
|
||||
#and also wait for seconds ?
|
||||
#await page.waitForTimeout(1000);
|
||||
#await page.waitForTimeout(extra_wait_ms);
|
||||
|
||||
def goto_operation():
|
||||
return self.page.goto(value, timeout=0, wait_until='load')
|
||||
|
||||
response = self.safe_page_operation(goto_operation)
|
||||
logger.debug(f"Time to goto URL {time.time()-now:.2f}s")
|
||||
return response
|
||||
|
||||
@@ -103,116 +144,218 @@ class steppable_browser_interface():
|
||||
|
||||
def action_click_element_containing_text(self, selector=None, value=''):
|
||||
logger.debug("Clicking element containing text")
|
||||
if not len(value.strip()):
|
||||
if not value or not len(value.strip()):
|
||||
return
|
||||
elem = self.page.get_by_text(value)
|
||||
if elem.count():
|
||||
elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)
|
||||
|
||||
def click_operation():
|
||||
elem = self.page.get_by_text(value)
|
||||
if elem.count():
|
||||
elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)
|
||||
|
||||
self.safe_page_operation(click_operation)
|
||||
|
||||
def action_click_element_containing_text_if_exists(self, selector=None, value=''):
|
||||
logger.debug("Clicking element containing text if exists")
|
||||
if not len(value.strip()):
|
||||
return
|
||||
elem = self.page.get_by_text(value)
|
||||
logger.debug(f"Clicking element containing text - {elem.count()} elements found")
|
||||
if elem.count():
|
||||
elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)
|
||||
else:
|
||||
if not value or not len(value.strip()):
|
||||
return
|
||||
|
||||
def click_if_exists_operation():
|
||||
elem = self.page.get_by_text(value)
|
||||
logger.debug(f"Clicking element containing text - {elem.count()} elements found")
|
||||
if elem.count():
|
||||
elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)
|
||||
|
||||
self.safe_page_operation(click_if_exists_operation)
|
||||
|
||||
def action_enter_text_in_field(self, selector, value):
|
||||
if not len(selector.strip()):
|
||||
if not selector or not len(selector.strip()):
|
||||
return
|
||||
|
||||
self.page.fill(selector, value, timeout=self.action_timeout)
|
||||
def fill_operation():
|
||||
self.page.fill(selector, value, timeout=self.action_timeout)
|
||||
|
||||
self.safe_page_operation(fill_operation)
|
||||
|
||||
def action_execute_js(self, selector, value):
|
||||
response = self.page.evaluate(value)
|
||||
return response
|
||||
if not value:
|
||||
return None
|
||||
|
||||
def evaluate_operation():
|
||||
return self.page.evaluate(value)
|
||||
|
||||
return self.safe_page_operation(evaluate_operation)
|
||||
|
||||
def action_click_element(self, selector, value):
|
||||
logger.debug("Clicking element")
|
||||
if not len(selector.strip()):
|
||||
if not selector or not len(selector.strip()):
|
||||
return
|
||||
|
||||
self.page.click(selector=selector, timeout=self.action_timeout + 20 * 1000, delay=randint(200, 500))
|
||||
def click_operation():
|
||||
self.page.click(selector=selector, timeout=self.action_timeout + 20 * 1000, delay=randint(200, 500))
|
||||
|
||||
self.safe_page_operation(click_operation)
|
||||
|
||||
def action_click_element_if_exists(self, selector, value):
|
||||
import playwright._impl._errors as _api_types
|
||||
logger.debug("Clicking element if exists")
|
||||
if not len(selector.strip()):
|
||||
return
|
||||
try:
|
||||
self.page.click(selector, timeout=self.action_timeout, delay=randint(200, 500))
|
||||
except _api_types.TimeoutError as e:
|
||||
return
|
||||
except _api_types.Error as e:
|
||||
# Element was there, but page redrew and now its long long gone
|
||||
if not selector or not len(selector.strip()):
|
||||
return
|
||||
|
||||
def click_if_exists_operation():
|
||||
try:
|
||||
self.page.click(selector, timeout=self.action_timeout, delay=randint(200, 500))
|
||||
except _api_types.TimeoutError:
|
||||
return
|
||||
except _api_types.Error:
|
||||
# Element was there, but page redrew and now its long long gone
|
||||
return
|
||||
|
||||
self.safe_page_operation(click_if_exists_operation)
|
||||
|
||||
def action_click_x_y(self, selector, value):
|
||||
if not re.match(r'^\s?\d+\s?,\s?\d+\s?$', value):
|
||||
raise Exception("'Click X,Y' step should be in the format of '100 , 90'")
|
||||
if not value or not re.match(r'^\s?\d+\s?,\s?\d+\s?$', value):
|
||||
logger.warning("'Click X,Y' step should be in the format of '100 , 90'")
|
||||
return
|
||||
|
||||
x, y = value.strip().split(',')
|
||||
x = int(float(x.strip()))
|
||||
y = int(float(y.strip()))
|
||||
self.page.mouse.click(x=x, y=y, delay=randint(200, 500))
|
||||
try:
|
||||
x, y = value.strip().split(',')
|
||||
x = int(float(x.strip()))
|
||||
y = int(float(y.strip()))
|
||||
|
||||
def click_xy_operation():
|
||||
self.page.mouse.click(x=x, y=y, delay=randint(200, 500))
|
||||
|
||||
self.safe_page_operation(click_xy_operation)
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing x,y coordinates: {str(e)}")
|
||||
|
||||
def action__select_by_option_text(self, selector, value):
|
||||
if not selector or not len(selector.strip()):
|
||||
return
|
||||
|
||||
def select_operation():
|
||||
self.page.select_option(selector, label=value, timeout=self.action_timeout)
|
||||
|
||||
self.safe_page_operation(select_operation)
|
||||
|
||||
def action_scroll_down(self, selector, value):
|
||||
# Some sites this doesnt work on for some reason
|
||||
self.page.mouse.wheel(0, 600)
|
||||
self.page.wait_for_timeout(1000)
|
||||
def scroll_operation():
|
||||
# Some sites this doesnt work on for some reason
|
||||
self.page.mouse.wheel(0, 600)
|
||||
self.page.wait_for_timeout(1000)
|
||||
|
||||
self.safe_page_operation(scroll_operation)
|
||||
|
||||
def action_wait_for_seconds(self, selector, value):
|
||||
self.page.wait_for_timeout(float(value.strip()) * 1000)
|
||||
try:
|
||||
seconds = float(value.strip()) if value else 1.0
|
||||
|
||||
def wait_operation():
|
||||
self.page.wait_for_timeout(seconds * 1000)
|
||||
|
||||
self.safe_page_operation(wait_operation)
|
||||
except (ValueError, TypeError) as e:
|
||||
logger.error(f"Invalid value for wait_for_seconds: {str(e)}")
|
||||
|
||||
def action_wait_for_text(self, selector, value):
|
||||
if not value:
|
||||
return
|
||||
|
||||
import json
|
||||
v = json.dumps(value)
|
||||
self.page.wait_for_function(f'document.querySelector("body").innerText.includes({v});', timeout=30000)
|
||||
|
||||
def wait_for_text_operation():
|
||||
self.page.wait_for_function(
|
||||
f'document.querySelector("body").innerText.includes({v});',
|
||||
timeout=30000
|
||||
)
|
||||
|
||||
self.safe_page_operation(wait_for_text_operation)
|
||||
|
||||
def action_wait_for_text_in_element(self, selector, value):
|
||||
if not selector or not value:
|
||||
return
|
||||
|
||||
import json
|
||||
s = json.dumps(selector)
|
||||
v = json.dumps(value)
|
||||
self.page.wait_for_function(f'document.querySelector({s}).innerText.includes({v});', timeout=30000)
|
||||
|
||||
def wait_for_text_in_element_operation():
|
||||
self.page.wait_for_function(
|
||||
f'document.querySelector({s}).innerText.includes({v});',
|
||||
timeout=30000
|
||||
)
|
||||
|
||||
self.safe_page_operation(wait_for_text_in_element_operation)
|
||||
|
||||
# @todo - in the future make some popout interface to capture what needs to be set
|
||||
# https://playwright.dev/python/docs/api/class-keyboard
|
||||
def action_press_enter(self, selector, value):
|
||||
self.page.keyboard.press("Enter", delay=randint(200, 500))
|
||||
def press_operation():
|
||||
self.page.keyboard.press("Enter", delay=randint(200, 500))
|
||||
|
||||
self.safe_page_operation(press_operation)
|
||||
|
||||
def action_press_page_up(self, selector, value):
|
||||
self.page.keyboard.press("PageUp", delay=randint(200, 500))
|
||||
def press_operation():
|
||||
self.page.keyboard.press("PageUp", delay=randint(200, 500))
|
||||
|
||||
self.safe_page_operation(press_operation)
|
||||
|
||||
def action_press_page_down(self, selector, value):
|
||||
self.page.keyboard.press("PageDown", delay=randint(200, 500))
|
||||
def press_operation():
|
||||
self.page.keyboard.press("PageDown", delay=randint(200, 500))
|
||||
|
||||
self.safe_page_operation(press_operation)
|
||||
|
||||
def action_check_checkbox(self, selector, value):
|
||||
self.page.locator(selector).check(timeout=self.action_timeout)
|
||||
if not selector:
|
||||
return
|
||||
|
||||
def check_operation():
|
||||
self.page.locator(selector).check(timeout=self.action_timeout)
|
||||
|
||||
self.safe_page_operation(check_operation)
|
||||
|
||||
def action_uncheck_checkbox(self, selector, value):
|
||||
self.page.locator(selector).uncheck(timeout=self.action_timeout)
|
||||
if not selector:
|
||||
return
|
||||
|
||||
def uncheck_operation():
|
||||
self.page.locator(selector).uncheck(timeout=self.action_timeout)
|
||||
|
||||
self.safe_page_operation(uncheck_operation)
|
||||
|
||||
def action_remove_elements(self, selector, value):
|
||||
"""Removes all elements matching the given selector from the DOM."""
|
||||
self.page.locator(selector).evaluate_all("els => els.forEach(el => el.remove())")
|
||||
if not selector:
|
||||
return
|
||||
|
||||
def remove_operation():
|
||||
self.page.locator(selector).evaluate_all("els => els.forEach(el => el.remove())")
|
||||
|
||||
self.safe_page_operation(remove_operation)
|
||||
|
||||
def action_make_all_child_elements_visible(self, selector, value):
|
||||
"""Recursively makes all child elements inside the given selector fully visible."""
|
||||
self.page.locator(selector).locator("*").evaluate_all("""
|
||||
els => els.forEach(el => {
|
||||
el.style.display = 'block'; // Forces it to be displayed
|
||||
el.style.visibility = 'visible'; // Ensures it's not hidden
|
||||
el.style.opacity = '1'; // Fully opaque
|
||||
el.style.position = 'relative'; // Avoids 'absolute' hiding
|
||||
el.style.height = 'auto'; // Expands collapsed elements
|
||||
el.style.width = 'auto'; // Ensures full visibility
|
||||
el.removeAttribute('hidden'); // Removes hidden attribute
|
||||
el.classList.remove('hidden', 'd-none'); // Removes common CSS hidden classes
|
||||
})
|
||||
""")
|
||||
if not selector:
|
||||
return
|
||||
|
||||
def make_visible_operation():
|
||||
self.page.locator(selector).locator("*").evaluate_all("""
|
||||
els => els.forEach(el => {
|
||||
el.style.display = 'block'; // Forces it to be displayed
|
||||
el.style.visibility = 'visible'; // Ensures it's not hidden
|
||||
el.style.opacity = '1'; // Fully opaque
|
||||
el.style.position = 'relative'; // Avoids 'absolute' hiding
|
||||
el.style.height = 'auto'; // Expands collapsed elements
|
||||
el.style.width = 'auto'; // Ensures full visibility
|
||||
el.removeAttribute('hidden'); // Removes hidden attribute
|
||||
el.classList.remove('hidden', 'd-none'); // Removes common CSS hidden classes
|
||||
})
|
||||
""")
|
||||
|
||||
self.safe_page_operation(make_visible_operation)
|
||||
|
||||
# Responsible for maintaining a live 'context' with the chrome CDP
|
||||
# @todo - how long do contexts live for anyway?
|
||||
@@ -224,7 +367,9 @@ class browsersteps_live_ui(steppable_browser_interface):
|
||||
# bump and kill this if idle after X sec
|
||||
age_start = 0
|
||||
headers = {}
|
||||
|
||||
# Track if resources are properly cleaned up
|
||||
_is_cleaned_up = False
|
||||
|
||||
# use a special driver, maybe locally etc
|
||||
command_executor = os.getenv(
|
||||
"PLAYWRIGHT_BROWSERSTEPS_DRIVER_URL"
|
||||
@@ -243,9 +388,14 @@ class browsersteps_live_ui(steppable_browser_interface):
|
||||
self.age_start = time.time()
|
||||
self.playwright_browser = playwright_browser
|
||||
self.start_url = start_url
|
||||
self._is_cleaned_up = False
|
||||
if self.context is None:
|
||||
self.connect(proxy=proxy)
|
||||
|
||||
def __del__(self):
|
||||
# Ensure cleanup happens if object is garbage collected
|
||||
self.cleanup()
|
||||
|
||||
# Connect and setup a new context
|
||||
def connect(self, proxy=None):
|
||||
# Should only get called once - test that
|
||||
@@ -264,64 +414,131 @@ class browsersteps_live_ui(steppable_browser_interface):
|
||||
user_agent=manage_user_agent(headers=self.headers),
|
||||
)
|
||||
|
||||
|
||||
self.page = self.context.new_page()
|
||||
|
||||
# self.page.set_default_navigation_timeout(keep_open)
|
||||
self.page.set_default_timeout(keep_open)
|
||||
# @todo probably this doesnt work
|
||||
self.page.on(
|
||||
"close",
|
||||
self.mark_as_closed,
|
||||
)
|
||||
# Set event handlers
|
||||
self.page.on("close", self.mark_as_closed)
|
||||
# Listen for all console events and handle errors
|
||||
self.page.on("console", lambda msg: print(f"Browser steps console - {msg.type}: {msg.text} {msg.args}"))
|
||||
|
||||
logger.debug(f"Time to browser setup {time.time()-now:.2f}s")
|
||||
self.page.wait_for_timeout(1 * 1000)
|
||||
|
||||
|
||||
def mark_as_closed(self):
|
||||
logger.debug("Page closed, cleaning up..")
|
||||
self.cleanup()
|
||||
|
||||
def cleanup(self):
|
||||
"""Properly clean up all resources to prevent memory leaks"""
|
||||
if self._is_cleaned_up:
|
||||
return
|
||||
|
||||
logger.debug("Cleaning up browser steps resources")
|
||||
|
||||
# Clean up page
|
||||
if hasattr(self, 'page') and self.page is not None:
|
||||
try:
|
||||
# Force garbage collection before closing
|
||||
self.page.request_gc()
|
||||
except Exception as e:
|
||||
logger.debug(f"Error during page garbage collection: {str(e)}")
|
||||
|
||||
try:
|
||||
# Remove event listeners before closing
|
||||
self.page.remove_listener("close", self.mark_as_closed)
|
||||
except Exception as e:
|
||||
logger.debug(f"Error removing event listeners: {str(e)}")
|
||||
|
||||
try:
|
||||
self.page.close()
|
||||
except Exception as e:
|
||||
logger.debug(f"Error closing page: {str(e)}")
|
||||
|
||||
self.page = None
|
||||
|
||||
# Clean up context
|
||||
if hasattr(self, 'context') and self.context is not None:
|
||||
try:
|
||||
self.context.close()
|
||||
except Exception as e:
|
||||
logger.debug(f"Error closing context: {str(e)}")
|
||||
|
||||
self.context = None
|
||||
|
||||
self._is_cleaned_up = True
|
||||
logger.debug("Browser steps resources cleanup complete")
|
||||
|
||||
@property
|
||||
def has_expired(self):
|
||||
if not self.page:
|
||||
if not self.page or self._is_cleaned_up:
|
||||
return True
|
||||
|
||||
|
||||
# Check if session has expired based on age
|
||||
max_age_seconds = int(os.getenv("BROWSER_STEPS_MAX_AGE_SECONDS", 60 * 10)) # Default 10 minutes
|
||||
if (time.time() - self.age_start) > max_age_seconds:
|
||||
logger.debug(f"Browser steps session expired after {max_age_seconds} seconds")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def get_current_state(self):
|
||||
"""Return the screenshot and interactive elements mapping, generally always called after action_()"""
|
||||
import importlib.resources
|
||||
import json
|
||||
# because we for now only run browser steps in playwright mode (not puppeteer mode)
|
||||
from changedetectionio.content_fetchers.playwright import capture_full_page
|
||||
|
||||
# Safety check - don't proceed if resources are cleaned up
|
||||
if self._is_cleaned_up or self.page is None:
|
||||
logger.warning("Attempted to get current state after cleanup")
|
||||
return (None, None)
|
||||
|
||||
xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
|
||||
|
||||
now = time.time()
|
||||
self.page.wait_for_timeout(1 * 1000)
|
||||
|
||||
screenshot = None
|
||||
xpath_data = None
|
||||
|
||||
try:
|
||||
# Get screenshot first
|
||||
screenshot = capture_full_page(page=self.page)
|
||||
logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s")
|
||||
|
||||
full_height = self.page.evaluate("document.documentElement.scrollHeight")
|
||||
# Then get interactive elements
|
||||
now = time.time()
|
||||
self.page.evaluate("var include_filters=''")
|
||||
self.page.request_gc()
|
||||
|
||||
if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
|
||||
logger.warning(f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
|
||||
screenshot = capture_stitched_together_full_page(self.page)
|
||||
else:
|
||||
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40)
|
||||
scan_elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span'
|
||||
|
||||
logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s")
|
||||
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
|
||||
xpath_data = json.loads(self.page.evaluate(xpath_element_js, {
|
||||
"visualselector_xpath_selectors": scan_elements,
|
||||
"max_height": MAX_TOTAL_HEIGHT
|
||||
}))
|
||||
self.page.request_gc()
|
||||
|
||||
now = time.time()
|
||||
self.page.evaluate("var include_filters=''")
|
||||
# Go find the interactive elements
|
||||
# @todo in the future, something smarter that can scan for elements with .click/focus etc event handlers?
|
||||
elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span'
|
||||
xpath_element_js = xpath_element_js.replace('%ELEMENTS%', elements)
|
||||
|
||||
xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
|
||||
# So the JS will find the smallest one first
|
||||
xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True)
|
||||
logger.debug(f"Time to scrape xpath element data in browser {time.time()-now:.2f}s")
|
||||
|
||||
# playwright._impl._api_types.Error: Browser closed.
|
||||
# @todo show some countdown timer?
|
||||
# Sort elements by size
|
||||
xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True)
|
||||
logger.debug(f"Time to scrape xPath element data in browser {time.time()-now:.2f}s")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting current state: {str(e)}")
|
||||
# Attempt recovery - force garbage collection
|
||||
try:
|
||||
self.page.request_gc()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Request garbage collection one final time
|
||||
try:
|
||||
self.page.request_gc()
|
||||
except:
|
||||
pass
|
||||
|
||||
return (screenshot, xpath_data)
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
<li class="tab"><a href="#notifications">Notifications</a></li>
|
||||
<li class="tab"><a href="#fetching">Fetching</a></li>
|
||||
<li class="tab"><a href="#filters">Global Filters</a></li>
|
||||
<li class="tab"><a href="#ui-options">UI Options</a></li>
|
||||
<li class="tab"><a href="#api">API</a></li>
|
||||
<li class="tab"><a href="#timedate">Time & Date</a></li>
|
||||
<li class="tab"><a href="#proxies">CAPTCHA & Proxies</a></li>
|
||||
@@ -240,6 +241,12 @@ nav
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="tab-pane-inner" id="ui-options">
|
||||
<div class="pure-control-group">
|
||||
{{ render_checkbox_field(form.application.form.ui.form.open_diff_in_new_tab, class="open_diff_in_new_tab") }}
|
||||
<span class="pure-form-message-inline">Enable this setting to open the diff page in a new tab. If disabled, the diff page will open in the current tab.</span>
|
||||
</div>
|
||||
</div>
|
||||
<div class="tab-pane-inner" id="proxies">
|
||||
<div id="recommended-proxy">
|
||||
<div>
|
||||
|
||||
@@ -104,6 +104,9 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
uuid = list(datastore.data['settings']['application']['tags'].keys()).pop()
|
||||
|
||||
default = datastore.data['settings']['application']['tags'].get(uuid)
|
||||
if not default:
|
||||
flash("Tag not found", "error")
|
||||
return redirect(url_for('watchlist.index'))
|
||||
|
||||
form = group_restock_settings_form(
|
||||
formdata=request.form if request.method == 'POST' else None,
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
/*const email_notification_prefix=JSON.parse('{{ emailprefix|tojson }}');*/
|
||||
/*{% endif %}*/
|
||||
|
||||
{% set has_tag_filters_extra='' %}
|
||||
|
||||
</script>
|
||||
|
||||
@@ -46,59 +47,12 @@
|
||||
</div>
|
||||
|
||||
<div class="tab-pane-inner" id="filters-and-triggers">
|
||||
<div class="pure-control-group">
|
||||
{% set field = render_field(form.include_filters,
|
||||
rows=5,
|
||||
placeholder="#example
|
||||
xpath://body/div/span[contains(@class, 'example-class')]",
|
||||
class="m-d")
|
||||
%}
|
||||
{{ field }}
|
||||
{% if '/text()' in field %}
|
||||
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the <element> contains <![CDATA[]]></strong></span><br>
|
||||
{% endif %}
|
||||
<span class="pure-form-message-inline">One CSS, xPath, JSON Path/JQ selector per line, <i>any</i> rules that matches will be used.<br>
|
||||
<div data-target="#advanced-help-selectors" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</div>
|
||||
<ul id="advanced-help-selectors">
|
||||
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
||||
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).
|
||||
<ul>
|
||||
<li>JSONPath: Prefix with <code>json:</code>, use <code>json:$</code> to force re-formatting if required, <a href="https://jsonpath.com/" target="new">test your JSONPath here</a>.</li>
|
||||
{% if jq_support %}
|
||||
<li>jq: Prefix with <code>jq:</code> and <a href="https://jqplay.org/" target="new">test your jq here</a>. Using <a href="https://stedolan.github.io/jq/" target="new">jq</a> allows for complex filtering and processing of JSON data with built-in functions, regex, filtering, and more. See examples and documentation <a href="https://stedolan.github.io/jq/manual/" target="new">here</a>. Prefix <code>jqraw:</code> outputs the results as text instead of a JSON list.</li>
|
||||
{% else %}
|
||||
<li>jq support not installed</li>
|
||||
{% endif %}
|
||||
</ul>
|
||||
</li>
|
||||
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash. To specify XPath to be used explicitly or the XPath rule starts with an XPath function: Prefix with <code>xpath:</code>
|
||||
<ul>
|
||||
<li>Example: <code>//*[contains(@class, 'sametext')]</code> or <code>xpath:count(//*[contains(@class, 'sametext')])</code>, <a
|
||||
href="http://xpather.com/" target="new">test your XPath here</a></li>
|
||||
<li>Example: Get all titles from an RSS feed <code>//title/text()</code></li>
|
||||
<li>To use XPath1.0: Prefix with <code>xpath1:</code></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
Please be sure that you thoroughly understand how to write CSS, JSONPath, XPath{% if jq_support %}, or jq selector{%endif%} rules before filing an issue on GitHub! <a
|
||||
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br>
|
||||
</span>
|
||||
</div>
|
||||
<fieldset class="pure-control-group">
|
||||
{{ render_field(form.subtractive_selectors, rows=5, placeholder="header
|
||||
footer
|
||||
nav
|
||||
.stockticker
|
||||
//*[contains(text(), 'Advertisement')]") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
|
||||
<li> Don't paste HTML here, use only CSS and XPath selectors </li>
|
||||
<li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
|
||||
</ul>
|
||||
</span>
|
||||
</fieldset>
|
||||
|
||||
<p>These settings are <strong><i>added</i></strong> to any existing watch configurations.</p>
|
||||
{% include "edit/include_subtract.html" %}
|
||||
<div class="text-filtering border-fieldset">
|
||||
<h3>Text filtering</h3>
|
||||
{% include "edit/text-options.html" %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{# rendered sub Template #}
|
||||
@@ -112,7 +66,7 @@ nav
|
||||
<div class="pure-control-group inline-radio">
|
||||
{{ render_checkbox_field(form.notification_muted) }}
|
||||
</div>
|
||||
{% if is_html_webdriver %}
|
||||
{% if 1 %}
|
||||
<div class="pure-control-group inline-radio">
|
||||
{{ render_checkbox_field(form.notification_screenshot) }}
|
||||
<span class="pure-form-message-inline">
|
||||
|
||||
@@ -125,7 +125,10 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, running_updat
|
||||
|
||||
else:
|
||||
# Recheck all, including muted
|
||||
for watch_uuid, watch in datastore.data['watching'].items():
|
||||
# Get most overdue first
|
||||
for k in sorted(datastore.data['watching'].items(), key=lambda item: item[1].get('last_checked', 0)):
|
||||
watch_uuid = k[0]
|
||||
watch = k[1]
|
||||
if not watch['paused']:
|
||||
if watch_uuid not in running_uuids:
|
||||
if with_errors and not watch.get('last_error'):
|
||||
@@ -140,7 +143,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, running_updat
|
||||
if i == 1:
|
||||
flash("Queued 1 watch for rechecking.")
|
||||
if i > 1:
|
||||
flash("Queued {} watches for rechecking.".format(i))
|
||||
flash(f"Queued {i} watches for rechecking.")
|
||||
if i == 0:
|
||||
flash("No watches available to recheck.")
|
||||
|
||||
|
||||
@@ -213,9 +213,6 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
|
||||
if request.method == 'POST' and not form.validate():
|
||||
flash("An error occurred, please see below.", "error")
|
||||
|
||||
visualselector_data_is_ready = datastore.visualselector_data_is_ready(uuid)
|
||||
|
||||
|
||||
# JQ is difficult to install on windows and must be manually added (outside requirements.txt)
|
||||
jq_support = True
|
||||
try:
|
||||
@@ -225,16 +222,20 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
|
||||
|
||||
watch = datastore.data['watching'].get(uuid)
|
||||
|
||||
# if system or watch is configured to need a chrome type browser
|
||||
system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver'
|
||||
|
||||
watch_uses_webdriver = False
|
||||
watch_needs_selenium_or_playwright = False
|
||||
if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'):
|
||||
watch_uses_webdriver = True
|
||||
watch_needs_selenium_or_playwright = True
|
||||
|
||||
|
||||
from zoneinfo import available_timezones
|
||||
|
||||
# Only works reliably with Playwright
|
||||
|
||||
# Import the global plugin system
|
||||
from changedetectionio.pluggy_interface import collect_ui_edit_stats_extras
|
||||
|
||||
template_args = {
|
||||
'available_processors': processors.available_processors(),
|
||||
'available_timezones': sorted(available_timezones()),
|
||||
@@ -247,14 +248,18 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
|
||||
'has_default_notification_urls': True if len(datastore.data['settings']['application']['notification_urls']) else False,
|
||||
'has_extra_headers_file': len(datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) > 0,
|
||||
'has_special_tag_options': _watch_has_tag_options_set(watch=watch),
|
||||
'watch_uses_webdriver': watch_uses_webdriver,
|
||||
'jq_support': jq_support,
|
||||
'playwright_enabled': os.getenv('PLAYWRIGHT_DRIVER_URL', False),
|
||||
'settings_application': datastore.data['settings']['application'],
|
||||
'system_has_playwright_configured': os.getenv('PLAYWRIGHT_DRIVER_URL'),
|
||||
'system_has_webdriver_configured': os.getenv('WEBDRIVER_URL'),
|
||||
'ui_edit_stats_extras': collect_ui_edit_stats_extras(watch),
|
||||
'visual_selector_data_ready': datastore.visualselector_data_is_ready(watch_uuid=uuid),
|
||||
'timezone_default_config': datastore.data['settings']['application'].get('timezone'),
|
||||
'using_global_webdriver_wait': not default['webdriver_delay'],
|
||||
'uuid': uuid,
|
||||
'watch': watch
|
||||
'watch': watch,
|
||||
'watch_needs_selenium_or_playwright': watch_needs_selenium_or_playwright,
|
||||
}
|
||||
|
||||
included_content = None
|
||||
|
||||
@@ -4,7 +4,6 @@ from loguru import logger
|
||||
|
||||
from changedetectionio.store import ChangeDetectionStore
|
||||
from changedetectionio.auth_decorator import login_optionally_required
|
||||
from changedetectionio.notification import process_notification
|
||||
|
||||
def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
notification_blueprint = Blueprint('ui_notification', __name__, template_folder="../ui/templates")
|
||||
@@ -18,8 +17,11 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
|
||||
# Watch_uuid could be unset in the case it`s used in tag editor, global settings
|
||||
import apprise
|
||||
from ...apprise_plugin.assets import apprise_asset
|
||||
from ...apprise_plugin.custom_handlers import apprise_http_custom_handler # noqa: F401
|
||||
from changedetectionio.notification.handler import process_notification
|
||||
from changedetectionio.notification.apprise_plugin.assets import apprise_asset
|
||||
|
||||
from changedetectionio.notification.apprise_plugin.custom_handlers import apprise_http_custom_handler
|
||||
|
||||
apobj = apprise.Apprise(asset=apprise_asset)
|
||||
|
||||
is_global_settings_form = request.args.get('mode', '') == 'global-settings'
|
||||
|
||||
@@ -209,15 +209,18 @@
|
||||
<a href="{{ url_for('ui.ui_edit.edit_page', uuid=watch.uuid, tag=active_tag_uuid)}}#general" class="pure-button pure-button-primary">Edit</a>
|
||||
{% if watch.history_n >= 2 %}
|
||||
|
||||
{% set open_diff_in_new_tab = datastore.data['settings']['application']['ui'].get('open_diff_in_new_tab') %}
|
||||
{% set target_attr = ' target="' ~ watch.uuid ~ '"' if open_diff_in_new_tab else '' %}
|
||||
|
||||
{% if is_unviewed %}
|
||||
<a href="{{ url_for('ui.ui_views.diff_history_page', uuid=watch.uuid, from_version=watch.get_from_version_based_on_last_viewed) }}" target="{{watch.uuid}}" class="pure-button pure-button-primary diff-link">History</a>
|
||||
<a href="{{ url_for('ui.ui_views.diff_history_page', uuid=watch.uuid, from_version=watch.get_from_version_based_on_last_viewed) }}" {{target_attr}} class="pure-button pure-button-primary diff-link">History</a>
|
||||
{% else %}
|
||||
<a href="{{ url_for('ui.ui_views.diff_history_page', uuid=watch.uuid)}}" target="{{watch.uuid}}" class="pure-button pure-button-primary diff-link">History</a>
|
||||
<a href="{{ url_for('ui.ui_views.diff_history_page', uuid=watch.uuid)}}" {{target_attr}} class="pure-button pure-button-primary diff-link">History</a>
|
||||
{% endif %}
|
||||
|
||||
{% else %}
|
||||
{% if watch.history_n == 1 or (watch.history_n ==0 and watch.error_text_ctime )%}
|
||||
<a href="{{ url_for('ui.ui_views.preview_page', uuid=watch.uuid)}}" target="{{watch.uuid}}" class="pure-button pure-button-primary">Preview</a>
|
||||
<a href="{{ url_for('ui.ui_views.preview_page', uuid=watch.uuid)}}" {{target_attr}} class="pure-button pure-button-primary">Preview</a>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
</td>
|
||||
|
||||
@@ -94,20 +94,39 @@ def execute_ruleset_against_all_plugins(current_watch_uuid: str, application_dat
|
||||
EXECUTE_DATA = {}
|
||||
result = True
|
||||
|
||||
ruleset_settings = application_datastruct['watching'].get(current_watch_uuid)
|
||||
watch = application_datastruct['watching'].get(current_watch_uuid)
|
||||
|
||||
if ruleset_settings.get("conditions"):
|
||||
logic_operator = "and" if ruleset_settings.get("conditions_match_logic", "ALL") == "ALL" else "or"
|
||||
complete_rules = filter_complete_rules(ruleset_settings['conditions'])
|
||||
if watch and watch.get("conditions"):
|
||||
logic_operator = "and" if watch.get("conditions_match_logic", "ALL") == "ALL" else "or"
|
||||
complete_rules = filter_complete_rules(watch['conditions'])
|
||||
if complete_rules:
|
||||
# Give all plugins a chance to update the data dict again (that we will test the conditions against)
|
||||
for plugin in plugin_manager.get_plugins():
|
||||
new_execute_data = plugin.add_data(current_watch_uuid=current_watch_uuid,
|
||||
application_datastruct=application_datastruct,
|
||||
ephemeral_data=ephemeral_data)
|
||||
|
||||
if new_execute_data and isinstance(new_execute_data, dict):
|
||||
EXECUTE_DATA.update(new_execute_data)
|
||||
try:
|
||||
import concurrent.futures
|
||||
import time
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(
|
||||
plugin.add_data,
|
||||
current_watch_uuid=current_watch_uuid,
|
||||
application_datastruct=application_datastruct,
|
||||
ephemeral_data=ephemeral_data
|
||||
)
|
||||
|
||||
# Set a timeout of 10 seconds
|
||||
try:
|
||||
new_execute_data = future.result(timeout=10)
|
||||
if new_execute_data and isinstance(new_execute_data, dict):
|
||||
EXECUTE_DATA.update(new_execute_data)
|
||||
except concurrent.futures.TimeoutError:
|
||||
# The plugin took too long, abort processing for this watch
|
||||
raise Exception(f"Plugin {plugin.__class__.__name__} took more than 10 seconds to run.")
|
||||
except Exception as e:
|
||||
# Log the error but continue with the next plugin
|
||||
import logging
|
||||
logging.error(f"Error executing plugin {plugin.__class__.__name__}: {str(e)}")
|
||||
continue
|
||||
|
||||
# Create the ruleset
|
||||
ruleset = convert_to_jsonlogic(logic_operator=logic_operator, rule_dict=complete_rules)
|
||||
@@ -132,3 +151,18 @@ for plugin in plugin_manager.get_plugins():
|
||||
if isinstance(new_field_choices, list):
|
||||
field_choices.extend(new_field_choices)
|
||||
|
||||
def collect_ui_edit_stats_extras(watch):
|
||||
"""Collect and combine HTML content from all plugins that implement ui_edit_stats_extras"""
|
||||
extras_content = []
|
||||
|
||||
for plugin in plugin_manager.get_plugins():
|
||||
try:
|
||||
content = plugin.ui_edit_stats_extras(watch=watch)
|
||||
if content:
|
||||
extras_content.append(content)
|
||||
except Exception as e:
|
||||
# Skip plugins that don't implement the hook or have errors
|
||||
pass
|
||||
|
||||
return "\n".join(extras_content) if extras_content else ""
|
||||
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
import pluggy
|
||||
from . import default_plugin # Import the default plugin
|
||||
import os
|
||||
import importlib
|
||||
import sys
|
||||
from . import default_plugin
|
||||
|
||||
# ✅ Ensure that the namespace in HookspecMarker matches PluginManager
|
||||
PLUGIN_NAMESPACE = "changedetectionio_conditions"
|
||||
@@ -30,6 +33,11 @@ class ConditionsSpec:
|
||||
def add_data(current_watch_uuid, application_datastruct, ephemeral_data):
|
||||
"""Add to the datadict"""
|
||||
pass
|
||||
|
||||
@hookspec
|
||||
def ui_edit_stats_extras(watch):
|
||||
"""Return HTML content to add to the stats tab in the edit view"""
|
||||
pass
|
||||
|
||||
# ✅ Set up Pluggy Plugin Manager
|
||||
plugin_manager = pluggy.PluginManager(PLUGIN_NAMESPACE)
|
||||
@@ -40,5 +48,27 @@ plugin_manager.add_hookspecs(ConditionsSpec)
|
||||
# ✅ Register built-in plugins manually
|
||||
plugin_manager.register(default_plugin, "default_plugin")
|
||||
|
||||
# ✅ Load plugins from the plugins directory
|
||||
def load_plugins_from_directory():
|
||||
plugins_dir = os.path.join(os.path.dirname(__file__), 'plugins')
|
||||
if not os.path.exists(plugins_dir):
|
||||
return
|
||||
|
||||
# Get all Python files (excluding __init__.py)
|
||||
for filename in os.listdir(plugins_dir):
|
||||
if filename.endswith(".py") and filename != "__init__.py":
|
||||
module_name = filename[:-3] # Remove .py extension
|
||||
module_path = f"changedetectionio.conditions.plugins.{module_name}"
|
||||
|
||||
try:
|
||||
module = importlib.import_module(module_path)
|
||||
# Register the plugin with pluggy
|
||||
plugin_manager.register(module, module_name)
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Error loading plugin {module_name}: {e}")
|
||||
|
||||
# Load plugins from the plugins directory
|
||||
load_plugins_from_directory()
|
||||
|
||||
# ✅ Discover installed plugins from external packages (if any)
|
||||
plugin_manager.load_setuptools_entrypoints(PLUGIN_NAMESPACE)
|
||||
|
||||
1
changedetectionio/conditions/plugins/__init__.py
Normal file
1
changedetectionio/conditions/plugins/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Import plugins package to make them discoverable
|
||||
102
changedetectionio/conditions/plugins/levenshtein_plugin.py
Normal file
102
changedetectionio/conditions/plugins/levenshtein_plugin.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import pluggy
|
||||
from loguru import logger
|
||||
|
||||
# Support both plugin systems
|
||||
conditions_hookimpl = pluggy.HookimplMarker("changedetectionio_conditions")
|
||||
global_hookimpl = pluggy.HookimplMarker("changedetectionio")
|
||||
|
||||
def levenshtein_ratio_recent_history(watch, incoming_text=None):
|
||||
try:
|
||||
from Levenshtein import ratio, distance
|
||||
k = list(watch.history.keys())
|
||||
if len(k) >= 2:
|
||||
# When called from ui_edit_stats_extras, we don't have incoming_text
|
||||
if incoming_text is None:
|
||||
a = watch.get_history_snapshot(timestamp=k[-1]) # Latest snapshot
|
||||
b = watch.get_history_snapshot(timestamp=k[-2]) # Previous snapshot
|
||||
else:
|
||||
a = watch.get_history_snapshot(timestamp=k[-2]) # Second newest, incoming_text will be "newest"
|
||||
b = incoming_text
|
||||
|
||||
distance_value = distance(a, b)
|
||||
ratio_value = ratio(a, b)
|
||||
return {
|
||||
'distance': distance_value,
|
||||
'ratio': ratio_value,
|
||||
'percent_similar': round(ratio_value * 100, 2)
|
||||
}
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to calc similarity: {str(e)}")
|
||||
|
||||
return ''
|
||||
|
||||
@conditions_hookimpl
|
||||
def register_operators():
|
||||
pass
|
||||
|
||||
@conditions_hookimpl
|
||||
def register_operator_choices():
|
||||
pass
|
||||
|
||||
|
||||
@conditions_hookimpl
|
||||
def register_field_choices():
|
||||
return [
|
||||
("levenshtein_ratio", "Levenshtein - Text similarity ratio"),
|
||||
("levenshtein_distance", "Levenshtein - Text change distance"),
|
||||
]
|
||||
|
||||
@conditions_hookimpl
|
||||
def add_data(current_watch_uuid, application_datastruct, ephemeral_data):
|
||||
res = {}
|
||||
watch = application_datastruct['watching'].get(current_watch_uuid)
|
||||
# ephemeral_data['text'] will be the current text after filters, they may have edited filters but not saved them yet etc
|
||||
|
||||
if watch and 'text' in ephemeral_data:
|
||||
lev_data = levenshtein_ratio_recent_history(watch, ephemeral_data['text'])
|
||||
if isinstance(lev_data, dict):
|
||||
res['levenshtein_ratio'] = lev_data.get('ratio', 0)
|
||||
res['levenshtein_similarity'] = lev_data.get('percent_similar', 0)
|
||||
res['levenshtein_distance'] = lev_data.get('distance', 0)
|
||||
|
||||
return res
|
||||
|
||||
@global_hookimpl
|
||||
def ui_edit_stats_extras(watch):
|
||||
"""Add Levenshtein stats to the UI using the global plugin system"""
|
||||
"""Generate the HTML for Levenshtein stats - shared by both plugin systems"""
|
||||
if len(watch.history.keys()) < 2:
|
||||
return "<p>Not enough history to calculate Levenshtein metrics</p>"
|
||||
|
||||
try:
|
||||
lev_data = levenshtein_ratio_recent_history(watch)
|
||||
if not lev_data or not isinstance(lev_data, dict):
|
||||
return "<p>Unable to calculate Levenshtein metrics</p>"
|
||||
|
||||
html = f"""
|
||||
<div class="levenshtein-stats">
|
||||
<h4>Levenshtein Text Similarity Details</h4>
|
||||
<table class="pure-table">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Raw distance (edits needed)</td>
|
||||
<td>{lev_data['distance']}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Similarity ratio</td>
|
||||
<td>{lev_data['ratio']:.4f}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Percent similar</td>
|
||||
<td>{lev_data['percent_similar']}%</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p style="font-size: 80%;">Levenshtein metrics compare the last two snapshots, measuring how many character edits are needed to transform one into the other.</p>
|
||||
</div>
|
||||
"""
|
||||
return html
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating Levenshtein UI extras: {str(e)}")
|
||||
return "<p>Error calculating Levenshtein metrics</p>"
|
||||
|
||||
82
changedetectionio/conditions/plugins/wordcount_plugin.py
Normal file
82
changedetectionio/conditions/plugins/wordcount_plugin.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import pluggy
|
||||
from loguru import logger
|
||||
|
||||
# Support both plugin systems
|
||||
conditions_hookimpl = pluggy.HookimplMarker("changedetectionio_conditions")
|
||||
global_hookimpl = pluggy.HookimplMarker("changedetectionio")
|
||||
|
||||
def count_words_in_history(watch, incoming_text=None):
|
||||
"""Count words in snapshot text"""
|
||||
try:
|
||||
if incoming_text is not None:
|
||||
# When called from add_data with incoming text
|
||||
return len(incoming_text.split())
|
||||
elif watch.history.keys():
|
||||
# When called from UI extras to count latest snapshot
|
||||
latest_key = list(watch.history.keys())[-1]
|
||||
latest_content = watch.get_history_snapshot(latest_key)
|
||||
return len(latest_content.split())
|
||||
return 0
|
||||
except Exception as e:
|
||||
logger.error(f"Error counting words: {str(e)}")
|
||||
return 0
|
||||
|
||||
# Implement condition plugin hooks
|
||||
@conditions_hookimpl
|
||||
def register_operators():
|
||||
# No custom operators needed
|
||||
return {}
|
||||
|
||||
@conditions_hookimpl
|
||||
def register_operator_choices():
|
||||
# No custom operator choices needed
|
||||
return []
|
||||
|
||||
@conditions_hookimpl
|
||||
def register_field_choices():
|
||||
# Add a field that will be available in conditions
|
||||
return [
|
||||
("word_count", "Word count of content"),
|
||||
]
|
||||
|
||||
@conditions_hookimpl
|
||||
def add_data(current_watch_uuid, application_datastruct, ephemeral_data):
|
||||
"""Add word count data for conditions"""
|
||||
result = {}
|
||||
watch = application_datastruct['watching'].get(current_watch_uuid)
|
||||
|
||||
if watch and 'text' in ephemeral_data:
|
||||
word_count = count_words_in_history(watch, ephemeral_data['text'])
|
||||
result['word_count'] = word_count
|
||||
|
||||
return result
|
||||
|
||||
def _generate_stats_html(watch):
|
||||
"""Generate the HTML content for the stats tab"""
|
||||
word_count = count_words_in_history(watch)
|
||||
|
||||
html = f"""
|
||||
<div class="word-count-stats">
|
||||
<h4>Content Analysis</h4>
|
||||
<table class="pure-table">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Word count (latest snapshot)</td>
|
||||
<td>{word_count}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p style="font-size: 80%;">Word count is a simple measure of content length, calculated by splitting text on whitespace.</p>
|
||||
</div>
|
||||
"""
|
||||
return html
|
||||
|
||||
@conditions_hookimpl
|
||||
def ui_edit_stats_extras(watch):
|
||||
"""Add word count stats to the UI through conditions plugin system"""
|
||||
return _generate_stats_html(watch)
|
||||
|
||||
@global_hookimpl
|
||||
def ui_edit_stats_extras(watch):
|
||||
"""Add word count stats to the UI using the global plugin system"""
|
||||
return _generate_stats_html(watch)
|
||||
@@ -7,11 +7,29 @@ import os
|
||||
# Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>.
|
||||
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button'
|
||||
|
||||
SCREENSHOT_MAX_HEIGHT_DEFAULT = 20000
|
||||
SCREENSHOT_DEFAULT_QUALITY = 40
|
||||
|
||||
# Maximum total height for the final image (When in stitch mode).
|
||||
# We limit this to 16000px due to the huge amount of RAM that was being used
|
||||
# Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc)
|
||||
SCREENSHOT_MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
|
||||
|
||||
# The size at which we will switch to stitching method, when below this (and
|
||||
# MAX_TOTAL_HEIGHT which can be set by a user) we will use the default
|
||||
# screenshot method.
|
||||
SCREENSHOT_SIZE_STITCH_THRESHOLD = 8000
|
||||
|
||||
# available_fetchers() will scan this implementation looking for anything starting with html_
|
||||
# this information is used in the form selections
|
||||
from changedetectionio.content_fetchers.requests import fetcher as html_requests
|
||||
|
||||
|
||||
import importlib.resources
|
||||
XPATH_ELEMENT_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8')
|
||||
INSTOCK_DATA_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')
|
||||
|
||||
|
||||
def available_fetchers():
|
||||
# See the if statement at the bottom of this file for how we switch between playwright and webdriver
|
||||
import inspect
|
||||
|
||||
@@ -63,11 +63,6 @@ class Fetcher():
|
||||
# Time ONTOP of the system defined env minimum time
|
||||
render_extract_delay = 0
|
||||
|
||||
def __init__(self):
|
||||
import importlib.resources
|
||||
self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8')
|
||||
self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')
|
||||
|
||||
@abstractmethod
|
||||
def get_error(self):
|
||||
return self.error
|
||||
@@ -87,7 +82,7 @@ class Fetcher():
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def quit(self):
|
||||
def quit(self, watch=None):
|
||||
return
|
||||
|
||||
@abstractmethod
|
||||
@@ -143,6 +138,7 @@ class Fetcher():
|
||||
logger.debug(f">> Iterating check - browser Step n {step_n} - {step['operation']}...")
|
||||
self.screenshot_step("before-" + str(step_n))
|
||||
self.save_step_html("before-" + str(step_n))
|
||||
|
||||
try:
|
||||
optional_value = step['optional_value']
|
||||
selector = step['selector']
|
||||
|
||||
@@ -1,104 +0,0 @@
|
||||
|
||||
# Pages with a vertical height longer than this will use the 'stitch together' method.
|
||||
|
||||
# - Many GPUs have a max texture size of 16384x16384px (or lower on older devices).
|
||||
# - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits.
|
||||
# - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer.
|
||||
|
||||
|
||||
# The size at which we will switch to stitching method
|
||||
SCREENSHOT_SIZE_STITCH_THRESHOLD=8000
|
||||
|
||||
from loguru import logger
|
||||
|
||||
def capture_stitched_together_full_page(page):
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
MAX_TOTAL_HEIGHT = SCREENSHOT_SIZE_STITCH_THRESHOLD*4 # Maximum total height for the final image (When in stitch mode)
|
||||
MAX_CHUNK_HEIGHT = 4000 # Height per screenshot chunk
|
||||
WARNING_TEXT_HEIGHT = 20 # Height of the warning text overlay
|
||||
|
||||
# Save the original viewport size
|
||||
original_viewport = page.viewport_size
|
||||
now = time.time()
|
||||
|
||||
try:
|
||||
viewport = page.viewport_size
|
||||
page_height = page.evaluate("document.documentElement.scrollHeight")
|
||||
|
||||
# Limit the total capture height
|
||||
capture_height = min(page_height, MAX_TOTAL_HEIGHT)
|
||||
|
||||
images = []
|
||||
total_captured_height = 0
|
||||
|
||||
for offset in range(0, capture_height, MAX_CHUNK_HEIGHT):
|
||||
# Ensure we do not exceed the total height limit
|
||||
chunk_height = min(MAX_CHUNK_HEIGHT, MAX_TOTAL_HEIGHT - total_captured_height)
|
||||
|
||||
# Adjust viewport size for this chunk
|
||||
page.set_viewport_size({"width": viewport["width"], "height": chunk_height})
|
||||
|
||||
# Scroll to the correct position
|
||||
page.evaluate(f"window.scrollTo(0, {offset})")
|
||||
|
||||
# Capture screenshot chunk
|
||||
screenshot_bytes = page.screenshot(type='jpeg', quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
|
||||
images.append(Image.open(io.BytesIO(screenshot_bytes)))
|
||||
|
||||
total_captured_height += chunk_height
|
||||
|
||||
# Stop if we reached the maximum total height
|
||||
if total_captured_height >= MAX_TOTAL_HEIGHT:
|
||||
break
|
||||
|
||||
# Create the final stitched image
|
||||
stitched_image = Image.new('RGB', (viewport["width"], total_captured_height))
|
||||
y_offset = 0
|
||||
|
||||
# Stitch the screenshot chunks together
|
||||
for img in images:
|
||||
stitched_image.paste(img, (0, y_offset))
|
||||
y_offset += img.height
|
||||
|
||||
logger.debug(f"Screenshot stitched together in {time.time()-now:.2f}s")
|
||||
|
||||
# Overlay warning text if the screenshot was trimmed
|
||||
if page_height > MAX_TOTAL_HEIGHT:
|
||||
draw = ImageDraw.Draw(stitched_image)
|
||||
warning_text = f"WARNING: Screenshot was {page_height}px but trimmed to {MAX_TOTAL_HEIGHT}px because it was too long"
|
||||
|
||||
# Load font (default system font if Arial is unavailable)
|
||||
try:
|
||||
font = ImageFont.truetype("arial.ttf", WARNING_TEXT_HEIGHT) # Arial (Windows/Mac)
|
||||
except IOError:
|
||||
font = ImageFont.load_default() # Default font if Arial not found
|
||||
|
||||
# Get text bounding box (correct method for newer Pillow versions)
|
||||
text_bbox = draw.textbbox((0, 0), warning_text, font=font)
|
||||
text_width = text_bbox[2] - text_bbox[0] # Calculate text width
|
||||
text_height = text_bbox[3] - text_bbox[1] # Calculate text height
|
||||
|
||||
# Define background rectangle (top of the image)
|
||||
draw.rectangle([(0, 0), (viewport["width"], WARNING_TEXT_HEIGHT)], fill="white")
|
||||
|
||||
# Center text horizontally within the warning area
|
||||
text_x = (viewport["width"] - text_width) // 2
|
||||
text_y = (WARNING_TEXT_HEIGHT - text_height) // 2
|
||||
|
||||
# Draw the warning text in red
|
||||
draw.text((text_x, text_y), warning_text, fill="red", font=font)
|
||||
|
||||
# Save or return the final image
|
||||
output = io.BytesIO()
|
||||
stitched_image.save(output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
|
||||
screenshot = output.getvalue()
|
||||
|
||||
finally:
|
||||
# Restore the original viewport size
|
||||
page.set_viewport_size(original_viewport)
|
||||
|
||||
return screenshot
|
||||
@@ -4,10 +4,76 @@ from urllib.parse import urlparse
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD
|
||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
|
||||
SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
|
||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
|
||||
|
||||
def capture_full_page(page):
|
||||
import os
|
||||
import time
|
||||
from multiprocessing import Process, Pipe
|
||||
|
||||
start = time.time()
|
||||
|
||||
page_height = page.evaluate("document.documentElement.scrollHeight")
|
||||
page_width = page.evaluate("document.documentElement.scrollWidth")
|
||||
original_viewport = page.viewport_size
|
||||
|
||||
logger.debug(f"Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width}")
|
||||
|
||||
# Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks
|
||||
step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Size that won't cause GPU to overflow
|
||||
screenshot_chunks = []
|
||||
y = 0
|
||||
|
||||
if page_height > page.viewport_size['height']:
|
||||
if page_height < step_size:
|
||||
step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
|
||||
logger.debug(f"Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
|
||||
# Set viewport to a larger size to capture more content at once
|
||||
page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})
|
||||
|
||||
# Capture screenshots in chunks up to the max total height
|
||||
while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
|
||||
page.request_gc()
|
||||
page.evaluate(f"window.scrollTo(0, {y})")
|
||||
page.request_gc()
|
||||
screenshot_chunks.append(page.screenshot(
|
||||
type="jpeg",
|
||||
full_page=False,
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 72))
|
||||
))
|
||||
y += step_size
|
||||
page.request_gc()
|
||||
|
||||
# Restore original viewport size
|
||||
page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']})
|
||||
|
||||
# If we have multiple chunks, stitch them together
|
||||
if len(screenshot_chunks) > 1:
|
||||
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
|
||||
logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
|
||||
parent_conn, child_conn = Pipe()
|
||||
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
|
||||
p.start()
|
||||
screenshot = parent_conn.recv_bytes()
|
||||
p.join()
|
||||
logger.debug(
|
||||
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
|
||||
# Explicit cleanup
|
||||
del screenshot_chunks
|
||||
del p
|
||||
del parent_conn, child_conn
|
||||
screenshot_chunks = None
|
||||
return screenshot
|
||||
|
||||
logger.debug(
|
||||
f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
|
||||
|
||||
return screenshot_chunks[0]
|
||||
|
||||
|
||||
class fetcher(Fetcher):
|
||||
fetcher_description = "Playwright {}/Javascript".format(
|
||||
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
|
||||
@@ -60,7 +126,8 @@ class fetcher(Fetcher):
|
||||
|
||||
def screenshot_step(self, step_n=''):
|
||||
super().screenshot_step(step_n=step_n)
|
||||
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
|
||||
screenshot = capture_full_page(page=self.page)
|
||||
|
||||
|
||||
if self.browser_steps_screenshot_path is not None:
|
||||
destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n))
|
||||
@@ -89,7 +156,6 @@ class fetcher(Fetcher):
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
import playwright._impl._errors
|
||||
from changedetectionio.content_fetchers import visualselector_xpath_selectors
|
||||
import time
|
||||
self.delete_browser_steps_screenshots()
|
||||
response = None
|
||||
@@ -164,9 +230,7 @@ class fetcher(Fetcher):
|
||||
raise PageUnloadable(url=url, status_code=None, message=str(e))
|
||||
|
||||
if self.status_code != 200 and not ignore_status_codes:
|
||||
screenshot = self.page.screenshot(type='jpeg', full_page=True,
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
|
||||
|
||||
screenshot = capture_full_page(self.page)
|
||||
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
|
||||
|
||||
if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
|
||||
@@ -187,13 +251,23 @@ class fetcher(Fetcher):
|
||||
self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
|
||||
else:
|
||||
self.page.evaluate("var include_filters=''")
|
||||
self.page.request_gc()
|
||||
|
||||
self.xpath_data = self.page.evaluate(
|
||||
"async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
|
||||
self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")
|
||||
# request_gc before and after evaluate to free up memory
|
||||
# @todo browsersteps etc
|
||||
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
|
||||
self.xpath_data = self.page.evaluate(XPATH_ELEMENT_JS, {
|
||||
"visualselector_xpath_selectors": visualselector_xpath_selectors,
|
||||
"max_height": MAX_TOTAL_HEIGHT
|
||||
})
|
||||
self.page.request_gc()
|
||||
|
||||
self.instock_data = self.page.evaluate(INSTOCK_DATA_JS)
|
||||
self.page.request_gc()
|
||||
|
||||
self.content = self.page.content()
|
||||
logger.debug(f"Time to scrape xpath element data in browser {time.time() - now:.2f}s")
|
||||
self.page.request_gc()
|
||||
logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")
|
||||
|
||||
# Bug 3 in Playwright screenshot handling
|
||||
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
||||
@@ -204,18 +278,41 @@ class fetcher(Fetcher):
|
||||
# acceptable screenshot quality here
|
||||
try:
|
||||
# The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage
|
||||
full_height = self.page.evaluate("document.documentElement.scrollHeight")
|
||||
|
||||
if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
|
||||
logger.warning(
|
||||
f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
|
||||
self.screenshot = capture_stitched_together_full_page(self.page)
|
||||
else:
|
||||
self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
|
||||
self.screenshot = capture_full_page(page=self.page)
|
||||
|
||||
except Exception as e:
|
||||
# It's likely the screenshot was too long/big and something crashed
|
||||
raise ScreenshotUnavailable(url=url, status_code=self.status_code)
|
||||
finally:
|
||||
context.close()
|
||||
browser.close()
|
||||
# Request garbage collection one more time before closing
|
||||
try:
|
||||
self.page.request_gc()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Clean up resources properly
|
||||
try:
|
||||
self.page.request_gc()
|
||||
except:
|
||||
pass
|
||||
|
||||
try:
|
||||
self.page.close()
|
||||
except:
|
||||
pass
|
||||
self.page = None
|
||||
|
||||
try:
|
||||
context.close()
|
||||
except:
|
||||
pass
|
||||
context = None
|
||||
|
||||
try:
|
||||
browser.close()
|
||||
except:
|
||||
pass
|
||||
browser = None
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -6,8 +6,77 @@ from urllib.parse import urlparse
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
|
||||
SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, \
|
||||
SCREENSHOT_MAX_TOTAL_HEIGHT
|
||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
|
||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, \
|
||||
BrowserConnectError
|
||||
|
||||
|
||||
# Bug 3 in Playwright screenshot handling
|
||||
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
||||
|
||||
# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
|
||||
# which will significantly increase the IO size between the server and client, it's recommended to use the lowest
|
||||
# acceptable screenshot quality here
|
||||
async def capture_full_page(page):
|
||||
import os
|
||||
import time
|
||||
from multiprocessing import Process, Pipe
|
||||
|
||||
start = time.time()
|
||||
|
||||
page_height = await page.evaluate("document.documentElement.scrollHeight")
|
||||
page_width = await page.evaluate("document.documentElement.scrollWidth")
|
||||
original_viewport = page.viewport
|
||||
|
||||
logger.debug(f"Puppeteer viewport size {page.viewport} page height {page_height} page width {page_width}")
|
||||
|
||||
# Bug 3 in Playwright screenshot handling
|
||||
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
||||
# JPEG is better here because the screenshots can be very very large
|
||||
|
||||
# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
|
||||
# which will significantly increase the IO size between the server and client, it's recommended to use the lowest
|
||||
# acceptable screenshot quality here
|
||||
|
||||
|
||||
step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Something that will not cause the GPU to overflow when taking the screenshot
|
||||
screenshot_chunks = []
|
||||
y = 0
|
||||
if page_height > page.viewport['height']:
|
||||
if page_height < step_size:
|
||||
step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
|
||||
await page.setViewport({'width': page.viewport['width'], 'height': step_size})
|
||||
|
||||
while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
|
||||
await page.evaluate(f"window.scrollTo(0, {y})")
|
||||
screenshot_chunks.append(await page.screenshot(type_='jpeg',
|
||||
fullPage=False,
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 72))))
|
||||
y += step_size
|
||||
|
||||
await page.setViewport({'width': original_viewport['width'], 'height': original_viewport['height']})
|
||||
|
||||
if len(screenshot_chunks) > 1:
|
||||
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
|
||||
logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
|
||||
parent_conn, child_conn = Pipe()
|
||||
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
|
||||
p.start()
|
||||
screenshot = parent_conn.recv_bytes()
|
||||
p.join()
|
||||
logger.debug(
|
||||
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
|
||||
|
||||
screenshot_chunks = None
|
||||
return screenshot
|
||||
|
||||
logger.debug(
|
||||
f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
|
||||
return screenshot_chunks[0]
|
||||
|
||||
|
||||
class fetcher(Fetcher):
|
||||
fetcher_description = "Puppeteer/direct {}/Javascript".format(
|
||||
@@ -79,7 +148,6 @@ class fetcher(Fetcher):
|
||||
empty_pages_are_a_change
|
||||
):
|
||||
|
||||
from changedetectionio.content_fetchers import visualselector_xpath_selectors
|
||||
self.delete_browser_steps_screenshots()
|
||||
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
|
||||
|
||||
@@ -181,11 +249,10 @@ class fetcher(Fetcher):
|
||||
raise PageUnloadable(url=url, status_code=None, message=str(e))
|
||||
|
||||
if self.status_code != 200 and not ignore_status_codes:
|
||||
screenshot = await self.page.screenshot(type_='jpeg',
|
||||
fullPage=True,
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
|
||||
screenshot = await capture_full_page(page=self.page)
|
||||
|
||||
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
|
||||
|
||||
content = await self.page.content
|
||||
|
||||
if not empty_pages_are_a_change and len(content.strip()) == 0:
|
||||
@@ -203,46 +270,31 @@ class fetcher(Fetcher):
|
||||
|
||||
# So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
|
||||
# Setup the xPath/VisualSelector scraper
|
||||
if current_include_filters is not None:
|
||||
if current_include_filters:
|
||||
js = json.dumps(current_include_filters)
|
||||
await self.page.evaluate(f"var include_filters={js}")
|
||||
else:
|
||||
await self.page.evaluate(f"var include_filters=''")
|
||||
|
||||
self.xpath_data = await self.page.evaluate(
|
||||
"async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
|
||||
self.instock_data = await self.page.evaluate("async () => {" + self.instock_data_js + "}")
|
||||
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
|
||||
self.xpath_data = await self.page.evaluate(XPATH_ELEMENT_JS, {
|
||||
"visualselector_xpath_selectors": visualselector_xpath_selectors,
|
||||
"max_height": MAX_TOTAL_HEIGHT
|
||||
})
|
||||
if not self.xpath_data:
|
||||
raise Exception(f"Content Fetcher > xPath scraper failed. Please report this URL so we can fix it :)")
|
||||
|
||||
self.instock_data = await self.page.evaluate(INSTOCK_DATA_JS)
|
||||
|
||||
self.content = await self.page.content
|
||||
# Bug 3 in Playwright screenshot handling
|
||||
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
||||
# JPEG is better here because the screenshots can be very very large
|
||||
|
||||
# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
|
||||
# which will significantly increase the IO size between the server and client, it's recommended to use the lowest
|
||||
# acceptable screenshot quality here
|
||||
try:
|
||||
self.screenshot = await self.page.screenshot(type_='jpeg',
|
||||
fullPage=True,
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
|
||||
except Exception as e:
|
||||
logger.error("Error fetching screenshot")
|
||||
# // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
|
||||
# // @ todo after text extract, we can place some overlay text with red background to say 'croppped'
|
||||
logger.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot')
|
||||
try:
|
||||
self.screenshot = await self.page.screenshot(type_='jpeg',
|
||||
fullPage=False,
|
||||
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
|
||||
except Exception as e:
|
||||
logger.error('ERROR: Failed to get viewport-only reduced screenshot :(')
|
||||
pass
|
||||
finally:
|
||||
# It's good to log here in the case that the browser crashes on shutting down but we still get the data we need
|
||||
logger.success(f"Fetching '{url}' complete, closing page")
|
||||
await self.page.close()
|
||||
logger.success(f"Fetching '{url}' complete, closing browser")
|
||||
await browser.close()
|
||||
self.screenshot = await capture_full_page(page=self.page)
|
||||
|
||||
# It's good to log here in the case that the browser crashes on shutting down but we still get the data we need
|
||||
logger.success(f"Fetching '{url}' complete, closing page")
|
||||
await self.page.close()
|
||||
logger.success(f"Fetching '{url}' complete, closing browser")
|
||||
await browser.close()
|
||||
logger.success(f"Fetching '{url}' complete, exiting puppeteer fetch.")
|
||||
|
||||
async def main(self, **kwargs):
|
||||
|
||||
@@ -96,3 +96,17 @@ class fetcher(Fetcher):
|
||||
|
||||
|
||||
self.raw_content = r.content
|
||||
|
||||
def quit(self, watch=None):
|
||||
|
||||
# In case they switched to `requests` fetcher from something else
|
||||
# Then the screenshot could be old, in any case, it's not used here.
|
||||
# REMOVE_REQUESTS_OLD_SCREENSHOTS - Mainly used for testing
|
||||
if strtobool(os.getenv("REMOVE_REQUESTS_OLD_SCREENSHOTS", 'true')):
|
||||
screenshot = watch.get_screenshot()
|
||||
if screenshot:
|
||||
try:
|
||||
os.unlink(screenshot)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to unlink screenshot: {screenshot} - {e}")
|
||||
|
||||
|
||||
@@ -1,190 +0,0 @@
|
||||
module.exports = async ({page, context}) => {
|
||||
|
||||
var {
|
||||
url,
|
||||
execute_js,
|
||||
user_agent,
|
||||
extra_wait_ms,
|
||||
req_headers,
|
||||
include_filters,
|
||||
xpath_element_js,
|
||||
screenshot_quality,
|
||||
proxy_username,
|
||||
proxy_password,
|
||||
disk_cache_dir,
|
||||
no_cache_list,
|
||||
block_url_list,
|
||||
} = context;
|
||||
|
||||
await page.setBypassCSP(true)
|
||||
await page.setExtraHTTPHeaders(req_headers);
|
||||
|
||||
if (user_agent) {
|
||||
await page.setUserAgent(user_agent);
|
||||
}
|
||||
// https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
|
||||
|
||||
await page.setDefaultNavigationTimeout(0);
|
||||
|
||||
if (proxy_username) {
|
||||
// Setting Proxy-Authentication header is deprecated, and doing so can trigger header change errors from Puppeteer
|
||||
// https://github.com/puppeteer/puppeteer/issues/676 ?
|
||||
// https://help.brightdata.com/hc/en-us/articles/12632549957649-Proxy-Manager-How-to-Guides#h_01HAKWR4Q0AFS8RZTNYWRDFJC2
|
||||
// https://cri.dev/posts/2020-03-30-How-to-solve-Puppeteer-Chrome-Error-ERR_INVALID_ARGUMENT/
|
||||
await page.authenticate({
|
||||
username: proxy_username,
|
||||
password: proxy_password
|
||||
});
|
||||
}
|
||||
|
||||
await page.setViewport({
|
||||
width: 1024,
|
||||
height: 768,
|
||||
deviceScaleFactor: 1,
|
||||
});
|
||||
|
||||
await page.setRequestInterception(true);
|
||||
if (disk_cache_dir) {
|
||||
console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<");
|
||||
}
|
||||
const fs = require('fs');
|
||||
const crypto = require('crypto');
|
||||
|
||||
function file_is_expired(file_path) {
|
||||
if (!fs.existsSync(file_path)) {
|
||||
return true;
|
||||
}
|
||||
var stats = fs.statSync(file_path);
|
||||
const now_date = new Date();
|
||||
const expire_seconds = 300;
|
||||
if ((now_date / 1000) - (stats.mtime.getTime() / 1000) > expire_seconds) {
|
||||
console.log("CACHE EXPIRED: " + file_path);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
page.on('request', async (request) => {
|
||||
// General blocking of requests that waste traffic
|
||||
if (block_url_list.some(substring => request.url().toLowerCase().includes(substring))) return request.abort();
|
||||
|
||||
if (disk_cache_dir) {
|
||||
const url = request.url();
|
||||
const key = crypto.createHash('md5').update(url).digest("hex");
|
||||
const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
|
||||
|
||||
// https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
|
||||
|
||||
if (fs.existsSync(dir_path + key)) {
|
||||
console.log("* CACHE HIT , using - " + dir_path + key + " - " + url);
|
||||
const cached_data = fs.readFileSync(dir_path + key);
|
||||
// @todo headers can come from dir_path+key+".meta" json file
|
||||
request.respond({
|
||||
status: 200,
|
||||
//contentType: 'text/html', //@todo
|
||||
body: cached_data
|
||||
});
|
||||
return;
|
||||
}
|
||||
}
|
||||
request.continue();
|
||||
});
|
||||
|
||||
|
||||
if (disk_cache_dir) {
|
||||
page.on('response', async (response) => {
|
||||
const url = response.url();
|
||||
// Basic filtering for sane responses
|
||||
if (response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200) {
|
||||
console.log("Skipping (not useful) - Status:" + response.status() + " Method:" + response.request().method() + " ResourceType:" + response.request().resourceType() + " " + url);
|
||||
return;
|
||||
}
|
||||
if (no_cache_list.some(substring => url.toLowerCase().includes(substring))) {
|
||||
console.log("Skipping (no_cache_list) - " + url);
|
||||
return;
|
||||
}
|
||||
if (url.toLowerCase().includes('data:')) {
|
||||
console.log("Skipping (embedded-data) - " + url);
|
||||
return;
|
||||
}
|
||||
response.buffer().then(buffer => {
|
||||
if (buffer.length > 100) {
|
||||
console.log("Cache - Saving " + response.request().method() + " - " + url + " - " + response.request().resourceType());
|
||||
|
||||
const key = crypto.createHash('md5').update(url).digest("hex");
|
||||
const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
|
||||
|
||||
if (!fs.existsSync(dir_path)) {
|
||||
fs.mkdirSync(dir_path, {recursive: true})
|
||||
}
|
||||
|
||||
if (fs.existsSync(dir_path + key)) {
|
||||
if (file_is_expired(dir_path + key)) {
|
||||
fs.writeFileSync(dir_path + key, buffer);
|
||||
}
|
||||
} else {
|
||||
fs.writeFileSync(dir_path + key, buffer);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
const r = await page.goto(url, {
|
||||
waitUntil: 'load'
|
||||
});
|
||||
|
||||
await page.waitForTimeout(1000);
|
||||
await page.waitForTimeout(extra_wait_ms);
|
||||
|
||||
if (execute_js) {
|
||||
await page.evaluate(execute_js);
|
||||
await page.waitForTimeout(200);
|
||||
}
|
||||
|
||||
var xpath_data;
|
||||
var instock_data;
|
||||
try {
|
||||
// Not sure the best way here, in the future this should be a new package added to npm then run in evaluatedCode
|
||||
// (Once the old playwright is removed)
|
||||
xpath_data = await page.evaluate((include_filters) => {%xpath_scrape_code%}, include_filters);
|
||||
instock_data = await page.evaluate(() => {%instock_scrape_code%});
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
// Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
|
||||
// Wrap it here (for now)
|
||||
|
||||
var b64s = false;
|
||||
try {
|
||||
b64s = await page.screenshot({encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg'});
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
// May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
|
||||
if (!b64s) {
|
||||
// @todo after text extract, we can place some overlay text with red background to say 'croppped'
|
||||
console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot');
|
||||
try {
|
||||
b64s = await page.screenshot({encoding: "base64", quality: screenshot_quality, type: 'jpeg'});
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
}
|
||||
|
||||
var html = await page.content();
|
||||
return {
|
||||
data: {
|
||||
'content': html,
|
||||
'headers': r.headers(),
|
||||
'instock_data': instock_data,
|
||||
'screenshot': b64s,
|
||||
'status_code': r.status(),
|
||||
'xpath_data': xpath_data
|
||||
},
|
||||
type: 'application/json',
|
||||
};
|
||||
};
|
||||
@@ -1,229 +1,223 @@
|
||||
// Restock Detector
|
||||
// (c) Leigh Morresi dgtlmoon@gmail.com
|
||||
//
|
||||
// Assumes the product is in stock to begin with, unless the following appears above the fold ;
|
||||
// - outOfStockTexts appears above the fold (out of stock)
|
||||
// - negateOutOfStockRegex (really is in stock)
|
||||
async () => {
|
||||
|
||||
function isItemInStock() {
|
||||
// @todo Pass these in so the same list can be used in non-JS fetchers
|
||||
const outOfStockTexts = [
|
||||
' أخبرني عندما يتوفر',
|
||||
'0 in stock',
|
||||
'actuellement indisponible',
|
||||
'agotado',
|
||||
'article épuisé',
|
||||
'artikel zurzeit vergriffen',
|
||||
'as soon as stock is available',
|
||||
'ausverkauft', // sold out
|
||||
'available for back order',
|
||||
'awaiting stock',
|
||||
'back in stock soon',
|
||||
'back-order or out of stock',
|
||||
'backordered',
|
||||
'benachrichtigt mich', // notify me
|
||||
'brak na stanie',
|
||||
'brak w magazynie',
|
||||
'coming soon',
|
||||
'currently have any tickets for this',
|
||||
'currently unavailable',
|
||||
'dieser artikel ist bald wieder verfügbar',
|
||||
'dostępne wkrótce',
|
||||
'en rupture',
|
||||
'en rupture de stock',
|
||||
'épuisé',
|
||||
'esgotado',
|
||||
'indisponible',
|
||||
'indisponível',
|
||||
'isn\'t in stock right now',
|
||||
'isnt in stock right now',
|
||||
'isn’t in stock right now',
|
||||
'item is no longer available',
|
||||
'let me know when it\'s available',
|
||||
'mail me when available',
|
||||
'message if back in stock',
|
||||
'mevcut değil',
|
||||
'nachricht bei',
|
||||
'nicht auf lager',
|
||||
'nicht lagernd',
|
||||
'nicht lieferbar',
|
||||
'nicht verfügbar',
|
||||
'nicht vorrätig',
|
||||
'nicht zur verfügung',
|
||||
'nie znaleziono produktów',
|
||||
'niet beschikbaar',
|
||||
'niet leverbaar',
|
||||
'niet op voorraad',
|
||||
'no disponible',
|
||||
'non disponibile',
|
||||
'non disponible',
|
||||
'no longer in stock',
|
||||
'no tickets available',
|
||||
'not available',
|
||||
'not currently available',
|
||||
'not in stock',
|
||||
'notify me when available',
|
||||
'notify me',
|
||||
'notify when available',
|
||||
'não disponível',
|
||||
'não estamos a aceitar encomendas',
|
||||
'out of stock',
|
||||
'out-of-stock',
|
||||
'plus disponible',
|
||||
'prodotto esaurito',
|
||||
'produkt niedostępny',
|
||||
'rupture',
|
||||
'sold out',
|
||||
'sold-out',
|
||||
'stok habis',
|
||||
'stok kosong',
|
||||
'stok varian ini habis',
|
||||
'stokta yok',
|
||||
'temporarily out of stock',
|
||||
'temporarily unavailable',
|
||||
'there were no search results for',
|
||||
'this item is currently unavailable',
|
||||
'tickets unavailable',
|
||||
'tidak dijual',
|
||||
'tidak tersedia',
|
||||
'tijdelijk uitverkocht',
|
||||
'tiket tidak tersedia',
|
||||
'tükendi',
|
||||
'unavailable nearby',
|
||||
'unavailable tickets',
|
||||
'vergriffen',
|
||||
'vorbestellen',
|
||||
'vorbestellung ist bald möglich',
|
||||
'we don\'t currently have any',
|
||||
'we couldn\'t find any products that match',
|
||||
'we do not currently have an estimate of when this product will be back in stock.',
|
||||
'we don\'t know when or if this item will be back in stock.',
|
||||
'we were not able to find a match',
|
||||
'when this arrives in stock',
|
||||
'zur zeit nicht an lager',
|
||||
'品切れ',
|
||||
'已售',
|
||||
'已售完',
|
||||
'품절'
|
||||
];
|
||||
function isItemInStock() {
|
||||
// @todo Pass these in so the same list can be used in non-JS fetchers
|
||||
const outOfStockTexts = [
|
||||
' أخبرني عندما يتوفر',
|
||||
'0 in stock',
|
||||
'actuellement indisponible',
|
||||
'agotado',
|
||||
'article épuisé',
|
||||
'artikel zurzeit vergriffen',
|
||||
'as soon as stock is available',
|
||||
'aucune offre n\'est disponible',
|
||||
'ausverkauft', // sold out
|
||||
'available for back order',
|
||||
'awaiting stock',
|
||||
'back in stock soon',
|
||||
'back-order or out of stock',
|
||||
'backordered',
|
||||
'benachrichtigt mich', // notify me
|
||||
'brak na stanie',
|
||||
'brak w magazynie',
|
||||
'coming soon',
|
||||
'currently have any tickets for this',
|
||||
'currently unavailable',
|
||||
'dieser artikel ist bald wieder verfügbar',
|
||||
'dostępne wkrótce',
|
||||
'en rupture',
|
||||
'esgotado',
|
||||
'in kürze lieferbar',
|
||||
'indisponible',
|
||||
'indisponível',
|
||||
'isn\'t in stock right now',
|
||||
'isnt in stock right now',
|
||||
'isn’t in stock right now',
|
||||
'item is no longer available',
|
||||
'let me know when it\'s available',
|
||||
'mail me when available',
|
||||
'message if back in stock',
|
||||
'mevcut değil',
|
||||
'nachricht bei',
|
||||
'nicht auf lager',
|
||||
'nicht lagernd',
|
||||
'nicht lieferbar',
|
||||
'nicht verfügbar',
|
||||
'nicht vorrätig',
|
||||
'nicht zur verfügung',
|
||||
'nie znaleziono produktów',
|
||||
'niet beschikbaar',
|
||||
'niet leverbaar',
|
||||
'niet op voorraad',
|
||||
'no disponible',
|
||||
'no featured offers available',
|
||||
'no longer in stock',
|
||||
'no tickets available',
|
||||
'non disponibile',
|
||||
'non disponible',
|
||||
'not available',
|
||||
'not currently available',
|
||||
'not in stock',
|
||||
'notify me when available',
|
||||
'notify me',
|
||||
'notify when available',
|
||||
'não disponível',
|
||||
'não estamos a aceitar encomendas',
|
||||
'out of stock',
|
||||
'out-of-stock',
|
||||
'plus disponible',
|
||||
'prodotto esaurito',
|
||||
'produkt niedostępny',
|
||||
'rupture',
|
||||
'sold out',
|
||||
'sold-out',
|
||||
'stok habis',
|
||||
'stok kosong',
|
||||
'stok varian ini habis',
|
||||
'stokta yok',
|
||||
'temporarily out of stock',
|
||||
'temporarily unavailable',
|
||||
'there were no search results for',
|
||||
'this item is currently unavailable',
|
||||
'tickets unavailable',
|
||||
'tidak dijual',
|
||||
'tidak tersedia',
|
||||
'tijdelijk uitverkocht',
|
||||
'tiket tidak tersedia',
|
||||
'tükendi',
|
||||
'unavailable nearby',
|
||||
'unavailable tickets',
|
||||
'vergriffen',
|
||||
'vorbestellen',
|
||||
'vorbestellung ist bald möglich',
|
||||
'we couldn\'t find any products that match',
|
||||
'we do not currently have an estimate of when this product will be back in stock.',
|
||||
'we don\'t currently have any',
|
||||
'we don\'t know when or if this item will be back in stock.',
|
||||
'we were not able to find a match',
|
||||
'when this arrives in stock',
|
||||
'when this item is available to order',
|
||||
'zur zeit nicht an lager',
|
||||
'épuisé',
|
||||
'品切れ',
|
||||
'已售',
|
||||
'已售完',
|
||||
'품절'
|
||||
];
|
||||
|
||||
|
||||
const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);
|
||||
const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);
|
||||
|
||||
function getElementBaseText(element) {
|
||||
// .textContent can include text from children which may give the wrong results
|
||||
// scan only immediate TEXT_NODEs, which will be a child of the element
|
||||
var text = "";
|
||||
for (var i = 0; i < element.childNodes.length; ++i)
|
||||
if (element.childNodes[i].nodeType === Node.TEXT_NODE)
|
||||
text += element.childNodes[i].textContent;
|
||||
return text.toLowerCase().trim();
|
||||
}
|
||||
function getElementBaseText(element) {
|
||||
// .textContent can include text from children which may give the wrong results
|
||||
// scan only immediate TEXT_NODEs, which will be a child of the element
|
||||
var text = "";
|
||||
for (var i = 0; i < element.childNodes.length; ++i)
|
||||
if (element.childNodes[i].nodeType === Node.TEXT_NODE)
|
||||
text += element.childNodes[i].textContent;
|
||||
return text.toLowerCase().trim();
|
||||
}
|
||||
|
||||
const negateOutOfStockRegex = new RegExp('^([0-9] in stock|add to cart|in stock)', 'ig');
|
||||
const negateOutOfStockRegex = new RegExp('^([0-9] in stock|add to cart|in stock)', 'ig');
|
||||
|
||||
// The out-of-stock or in-stock-text is generally always above-the-fold
|
||||
// and often below-the-fold is a list of related products that may or may not contain trigger text
|
||||
// so it's good to filter to just the 'above the fold' elements
|
||||
// and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist
|
||||
// The out-of-stock or in-stock-text is generally always above-the-fold
|
||||
// and often below-the-fold is a list of related products that may or may not contain trigger text
|
||||
// so it's good to filter to just the 'above the fold' elements
|
||||
// and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist
|
||||
|
||||
|
||||
// @todo - if it's SVG or IMG, go into image diff mode
|
||||
// %ELEMENTS% replaced at injection time because different interfaces use it with different settings
|
||||
|
||||
console.log("Scanning %ELEMENTS%");
|
||||
function collectVisibleElements(parent, visibleElements) {
|
||||
if (!parent) return; // Base case: if parent is null or undefined, return
|
||||
|
||||
function collectVisibleElements(parent, visibleElements) {
|
||||
if (!parent) return; // Base case: if parent is null or undefined, return
|
||||
// Add the parent itself to the visible elements array if it's of the specified types
|
||||
visibleElements.push(parent);
|
||||
|
||||
// Add the parent itself to the visible elements array if it's of the specified types
|
||||
visibleElements.push(parent);
|
||||
|
||||
// Iterate over the parent's children
|
||||
const children = parent.children;
|
||||
for (let i = 0; i < children.length; i++) {
|
||||
const child = children[i];
|
||||
if (
|
||||
child.nodeType === Node.ELEMENT_NODE &&
|
||||
window.getComputedStyle(child).display !== 'none' &&
|
||||
window.getComputedStyle(child).visibility !== 'hidden' &&
|
||||
child.offsetWidth >= 0 &&
|
||||
child.offsetHeight >= 0 &&
|
||||
window.getComputedStyle(child).contentVisibility !== 'hidden'
|
||||
) {
|
||||
// If the child is an element and is visible, recursively collect visible elements
|
||||
collectVisibleElements(child, visibleElements);
|
||||
// Iterate over the parent's children
|
||||
const children = parent.children;
|
||||
for (let i = 0; i < children.length; i++) {
|
||||
const child = children[i];
|
||||
if (
|
||||
child.nodeType === Node.ELEMENT_NODE &&
|
||||
window.getComputedStyle(child).display !== 'none' &&
|
||||
window.getComputedStyle(child).visibility !== 'hidden' &&
|
||||
child.offsetWidth >= 0 &&
|
||||
child.offsetHeight >= 0 &&
|
||||
window.getComputedStyle(child).contentVisibility !== 'hidden'
|
||||
) {
|
||||
// If the child is an element and is visible, recursively collect visible elements
|
||||
collectVisibleElements(child, visibleElements);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const elementsToScan = [];
|
||||
collectVisibleElements(document.body, elementsToScan);
|
||||
const elementsToScan = [];
|
||||
collectVisibleElements(document.body, elementsToScan);
|
||||
|
||||
var elementText = "";
|
||||
var elementText = "";
|
||||
|
||||
// REGEXS THAT REALLY MEAN IT'S IN STOCK
|
||||
for (let i = elementsToScan.length - 1; i >= 0; i--) {
|
||||
const element = elementsToScan[i];
|
||||
// REGEXS THAT REALLY MEAN IT'S IN STOCK
|
||||
for (let i = elementsToScan.length - 1; i >= 0; i--) {
|
||||
const element = elementsToScan[i];
|
||||
|
||||
// outside the 'fold' or some weird text in the heading area
|
||||
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
||||
if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||
continue
|
||||
// outside the 'fold' or some weird text in the heading area
|
||||
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
||||
if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||
continue
|
||||
}
|
||||
|
||||
elementText = "";
|
||||
try {
|
||||
if (element.tagName.toLowerCase() === "input") {
|
||||
elementText = element.value.toLowerCase().trim();
|
||||
} else {
|
||||
elementText = getElementBaseText(element);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn('stock-not-in-stock.js scraper - handling element for gettext failed', e);
|
||||
}
|
||||
|
||||
if (elementText.length) {
|
||||
// try which ones could mean its in stock
|
||||
if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) {
|
||||
console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`)
|
||||
return 'Possibly in stock';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
elementText = "";
|
||||
try {
|
||||
// OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK
|
||||
for (let i = elementsToScan.length - 1; i >= 0; i--) {
|
||||
const element = elementsToScan[i];
|
||||
// outside the 'fold' or some weird text in the heading area
|
||||
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
||||
// Note: theres also an automated test that places the 'out of stock' text fairly low down
|
||||
if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||
continue
|
||||
}
|
||||
elementText = "";
|
||||
if (element.tagName.toLowerCase() === "input") {
|
||||
elementText = element.value.toLowerCase().trim();
|
||||
} else {
|
||||
elementText = getElementBaseText(element);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn('stock-not-in-stock.js scraper - handling element for gettext failed', e);
|
||||
}
|
||||
|
||||
if (elementText.length) {
|
||||
// try which ones could mean its in stock
|
||||
if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) {
|
||||
console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`)
|
||||
return 'Possibly in stock';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK
|
||||
for (let i = elementsToScan.length - 1; i >= 0; i--) {
|
||||
const element = elementsToScan[i];
|
||||
// outside the 'fold' or some weird text in the heading area
|
||||
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
||||
// Note: theres also an automated test that places the 'out of stock' text fairly low down
|
||||
if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||
continue
|
||||
}
|
||||
elementText = "";
|
||||
if (element.tagName.toLowerCase() === "input") {
|
||||
elementText = element.value.toLowerCase().trim();
|
||||
} else {
|
||||
elementText = getElementBaseText(element);
|
||||
}
|
||||
|
||||
if (elementText.length) {
|
||||
// and these mean its out of stock
|
||||
for (const outOfStockText of outOfStockTexts) {
|
||||
if (elementText.includes(outOfStockText)) {
|
||||
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
|
||||
return outOfStockText; // item is out of stock
|
||||
if (elementText.length) {
|
||||
// and these mean its out of stock
|
||||
for (const outOfStockText of outOfStockTexts) {
|
||||
if (elementText.includes(outOfStockText)) {
|
||||
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
|
||||
return outOfStockText; // item is out of stock
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Returning 'Possibly in stock' - cant' find any useful matching text`)
|
||||
return 'Possibly in stock'; // possibly in stock, cant decide otherwise.
|
||||
}
|
||||
|
||||
console.log(`Returning 'Possibly in stock' - cant' find any useful matching text`)
|
||||
return 'Possibly in stock'; // possibly in stock, cant decide otherwise.
|
||||
}
|
||||
|
||||
// returns the element text that makes it think it's out of stock
|
||||
return isItemInStock().trim()
|
||||
|
||||
|
||||
return isItemInStock().trim()
|
||||
}
|
||||
|
||||
@@ -1,285 +1,285 @@
|
||||
// Copyright (C) 2021 Leigh Morresi (dgtlmoon@gmail.com)
|
||||
// All rights reserved.
|
||||
async (options) => {
|
||||
|
||||
// @file Scrape the page looking for elements of concern (%ELEMENTS%)
|
||||
// http://matatk.agrip.org.uk/tests/position-and-width/
|
||||
// https://stackoverflow.com/questions/26813480/when-is-element-getboundingclientrect-guaranteed-to-be-updated-accurate
|
||||
//
|
||||
// Some pages like https://www.londonstockexchange.com/stock/NCCL/ncondezi-energy-limited/analysis
|
||||
// will automatically force a scroll somewhere, so include the position offset
|
||||
// Lets hope the position doesnt change while we iterate the bbox's, but this is better than nothing
|
||||
var scroll_y = 0;
|
||||
try {
|
||||
scroll_y = +document.documentElement.scrollTop || document.body.scrollTop
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
let visualselector_xpath_selectors = options.visualselector_xpath_selectors
|
||||
let max_height = options.max_height
|
||||
|
||||
|
||||
// Include the getXpath script directly, easier than fetching
|
||||
function getxpath(e) {
|
||||
var n = e;
|
||||
if (n && n.id) return '//*[@id="' + n.id + '"]';
|
||||
for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) {
|
||||
for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling;
|
||||
for (d = n.nextSibling; d;) {
|
||||
if (d.nodeName === n.nodeName) {
|
||||
r = !0;
|
||||
break
|
||||
}
|
||||
d = d.nextSibling
|
||||
}
|
||||
o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode
|
||||
}
|
||||
return o.length ? "/" + o.reverse().join("/") : ""
|
||||
}
|
||||
|
||||
const findUpTag = (el) => {
|
||||
let r = el
|
||||
chained_css = [];
|
||||
depth = 0;
|
||||
|
||||
// Strategy 1: If it's an input, with name, and there's only one, prefer that
|
||||
if (el.name !== undefined && el.name.length) {
|
||||
var proposed = el.tagName + "[name=\"" + CSS.escape(el.name) + "\"]";
|
||||
var proposed_element = window.document.querySelectorAll(proposed);
|
||||
if (proposed_element.length) {
|
||||
if (proposed_element.length === 1) {
|
||||
return proposed;
|
||||
} else {
|
||||
// Some sites change ID but name= stays the same, we can hit it if we know the index
|
||||
// Find all the elements that match and work out the input[n]
|
||||
var n = Array.from(proposed_element).indexOf(el);
|
||||
// Return a Playwright selector for nthinput[name=zipcode]
|
||||
return proposed + " >> nth=" + n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 2: Keep going up until we hit an ID tag, imagine it's like #list-widget div h4
|
||||
while (r.parentNode) {
|
||||
if (depth === 5) {
|
||||
break;
|
||||
}
|
||||
if ('' !== r.id) {
|
||||
chained_css.unshift("#" + CSS.escape(r.id));
|
||||
final_selector = chained_css.join(' > ');
|
||||
// Be sure theres only one, some sites have multiples of the same ID tag :-(
|
||||
if (window.document.querySelectorAll(final_selector).length === 1) {
|
||||
return final_selector;
|
||||
}
|
||||
return null;
|
||||
} else {
|
||||
chained_css.unshift(r.tagName.toLowerCase());
|
||||
}
|
||||
r = r.parentNode;
|
||||
depth += 1;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
// @todo - if it's SVG or IMG, go into image diff mode
|
||||
// %ELEMENTS% replaced at injection time because different interfaces use it with different settings
|
||||
|
||||
var size_pos = [];
|
||||
// after page fetch, inject this JS
|
||||
// build a map of all elements and their positions (maybe that only include text?)
|
||||
var bbox;
|
||||
console.log("Scanning %ELEMENTS%");
|
||||
|
||||
function collectVisibleElements(parent, visibleElements) {
|
||||
if (!parent) return; // Base case: if parent is null or undefined, return
|
||||
|
||||
|
||||
// Add the parent itself to the visible elements array if it's of the specified types
|
||||
const tagName = parent.tagName.toLowerCase();
|
||||
if ("%ELEMENTS%".split(',').includes(tagName)) {
|
||||
visibleElements.push(parent);
|
||||
}
|
||||
|
||||
// Iterate over the parent's children
|
||||
const children = parent.children;
|
||||
for (let i = 0; i < children.length; i++) {
|
||||
const child = children[i];
|
||||
const computedStyle = window.getComputedStyle(child);
|
||||
|
||||
if (
|
||||
child.nodeType === Node.ELEMENT_NODE &&
|
||||
computedStyle.display !== 'none' &&
|
||||
computedStyle.visibility !== 'hidden' &&
|
||||
child.offsetWidth >= 0 &&
|
||||
child.offsetHeight >= 0 &&
|
||||
computedStyle.contentVisibility !== 'hidden'
|
||||
) {
|
||||
// If the child is an element and is visible, recursively collect visible elements
|
||||
collectVisibleElements(child, visibleElements);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create an array to hold the visible elements
|
||||
const visibleElementsArray = [];
|
||||
|
||||
// Call collectVisibleElements with the starting parent element
|
||||
collectVisibleElements(document.body, visibleElementsArray);
|
||||
|
||||
|
||||
visibleElementsArray.forEach(function (element) {
|
||||
|
||||
bbox = element.getBoundingClientRect();
|
||||
|
||||
// Skip really small ones, and where width or height ==0
|
||||
if (bbox['width'] * bbox['height'] < 10) {
|
||||
return
|
||||
}
|
||||
|
||||
// Don't include elements that are offset from canvas
|
||||
if (bbox['top'] + scroll_y < 0 || bbox['left'] < 0) {
|
||||
return
|
||||
}
|
||||
|
||||
// @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
|
||||
// it should not traverse when we know we can anchor off just an ID one level up etc..
|
||||
// maybe, get current class or id, keep traversing up looking for only class or id until there is just one match
|
||||
|
||||
// 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
|
||||
xpath_result = false;
|
||||
var scroll_y = 0;
|
||||
try {
|
||||
var d = findUpTag(element);
|
||||
if (d) {
|
||||
xpath_result = d;
|
||||
}
|
||||
scroll_y = +document.documentElement.scrollTop || document.body.scrollTop
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
// You could swap it and default to getXpath and then try the smarter one
|
||||
// default back to the less intelligent one
|
||||
if (!xpath_result) {
|
||||
try {
|
||||
// I've seen on FB and eBay that this doesnt work
|
||||
// ReferenceError: getXPath is not defined at eval (eval at evaluate (:152:29), <anonymous>:67:20) at UtilityScript.evaluate (<anonymous>:159:18) at UtilityScript.<anonymous> (<anonymous>:1:44)
|
||||
xpath_result = getxpath(element);
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
return
|
||||
|
||||
// Include the getXpath script directly, easier than fetching
|
||||
function getxpath(e) {
|
||||
var n = e;
|
||||
if (n && n.id) return '//*[@id="' + n.id + '"]';
|
||||
for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) {
|
||||
for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling;
|
||||
for (d = n.nextSibling; d;) {
|
||||
if (d.nodeName === n.nodeName) {
|
||||
r = !0;
|
||||
break
|
||||
}
|
||||
d = d.nextSibling
|
||||
}
|
||||
o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode
|
||||
}
|
||||
return o.length ? "/" + o.reverse().join("/") : ""
|
||||
}
|
||||
|
||||
const findUpTag = (el) => {
|
||||
let r = el
|
||||
chained_css = [];
|
||||
depth = 0;
|
||||
|
||||
// Strategy 1: If it's an input, with name, and there's only one, prefer that
|
||||
if (el.name !== undefined && el.name.length) {
|
||||
var proposed = el.tagName + "[name=\"" + CSS.escape(el.name) + "\"]";
|
||||
var proposed_element = window.document.querySelectorAll(proposed);
|
||||
if (proposed_element.length) {
|
||||
if (proposed_element.length === 1) {
|
||||
return proposed;
|
||||
} else {
|
||||
// Some sites change ID but name= stays the same, we can hit it if we know the index
|
||||
// Find all the elements that match and work out the input[n]
|
||||
var n = Array.from(proposed_element).indexOf(el);
|
||||
// Return a Playwright selector for nthinput[name=zipcode]
|
||||
return proposed + " >> nth=" + n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 2: Keep going up until we hit an ID tag, imagine it's like #list-widget div h4
|
||||
while (r.parentNode) {
|
||||
if (depth === 5) {
|
||||
break;
|
||||
}
|
||||
if ('' !== r.id) {
|
||||
chained_css.unshift("#" + CSS.escape(r.id));
|
||||
final_selector = chained_css.join(' > ');
|
||||
// Be sure theres only one, some sites have multiples of the same ID tag :-(
|
||||
if (window.document.querySelectorAll(final_selector).length === 1) {
|
||||
return final_selector;
|
||||
}
|
||||
return null;
|
||||
} else {
|
||||
chained_css.unshift(r.tagName.toLowerCase());
|
||||
}
|
||||
r = r.parentNode;
|
||||
depth += 1;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
// @todo - if it's SVG or IMG, go into image diff mode
|
||||
|
||||
var size_pos = [];
|
||||
// after page fetch, inject this JS
|
||||
// build a map of all elements and their positions (maybe that only include text?)
|
||||
var bbox;
|
||||
console.log(`Scanning for "${visualselector_xpath_selectors}"`);
|
||||
|
||||
function collectVisibleElements(parent, visibleElements) {
|
||||
if (!parent) return; // Base case: if parent is null or undefined, return
|
||||
|
||||
|
||||
// Add the parent itself to the visible elements array if it's of the specified types
|
||||
const tagName = parent.tagName.toLowerCase();
|
||||
if (visualselector_xpath_selectors.split(',').includes(tagName)) {
|
||||
visibleElements.push(parent);
|
||||
}
|
||||
|
||||
// Iterate over the parent's children
|
||||
const children = parent.children;
|
||||
for (let i = 0; i < children.length; i++) {
|
||||
const child = children[i];
|
||||
const computedStyle = window.getComputedStyle(child);
|
||||
|
||||
if (
|
||||
child.nodeType === Node.ELEMENT_NODE &&
|
||||
computedStyle.display !== 'none' &&
|
||||
computedStyle.visibility !== 'hidden' &&
|
||||
child.offsetWidth >= 0 &&
|
||||
child.offsetHeight >= 0 &&
|
||||
computedStyle.contentVisibility !== 'hidden'
|
||||
) {
|
||||
// If the child is an element and is visible, recursively collect visible elements
|
||||
collectVisibleElements(child, visibleElements);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
|
||||
// Create an array to hold the visible elements
|
||||
const visibleElementsArray = [];
|
||||
|
||||
let text = element.textContent.trim().slice(0, 30).trim();
|
||||
while (/\n{2,}|\t{2,}/.test(text)) {
|
||||
text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
|
||||
}
|
||||
// Call collectVisibleElements with the starting parent element
|
||||
collectVisibleElements(document.body, visibleElementsArray);
|
||||
|
||||
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
|
||||
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;
|
||||
const computedStyle = window.getComputedStyle(element);
|
||||
|
||||
size_pos.push({
|
||||
xpath: xpath_result,
|
||||
width: Math.round(bbox['width']),
|
||||
height: Math.round(bbox['height']),
|
||||
left: Math.floor(bbox['left']),
|
||||
top: Math.floor(bbox['top']) + scroll_y,
|
||||
// tagName used by Browser Steps
|
||||
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
|
||||
// tagtype used by Browser Steps
|
||||
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
|
||||
isClickable: computedStyle.cursor === "pointer",
|
||||
// Used by the keras trainer
|
||||
fontSize: computedStyle.getPropertyValue('font-size'),
|
||||
fontWeight: computedStyle.getPropertyValue('font-weight'),
|
||||
hasDigitCurrency: hasDigitCurrency,
|
||||
label: label,
|
||||
visibleElementsArray.forEach(function (element) {
|
||||
|
||||
bbox = element.getBoundingClientRect();
|
||||
|
||||
// Skip really small ones, and where width or height ==0
|
||||
if (bbox['width'] * bbox['height'] < 10) {
|
||||
return
|
||||
}
|
||||
|
||||
// Don't include elements that are offset from canvas
|
||||
if (bbox['top'] + scroll_y < 0 || bbox['left'] < 0) {
|
||||
return
|
||||
}
|
||||
|
||||
// @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
|
||||
// it should not traverse when we know we can anchor off just an ID one level up etc..
|
||||
// maybe, get current class or id, keep traversing up looking for only class or id until there is just one match
|
||||
|
||||
// 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
|
||||
xpath_result = false;
|
||||
try {
|
||||
var d = findUpTag(element);
|
||||
if (d) {
|
||||
xpath_result = d;
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
// You could swap it and default to getXpath and then try the smarter one
|
||||
// default back to the less intelligent one
|
||||
if (!xpath_result) {
|
||||
try {
|
||||
// I've seen on FB and eBay that this doesnt work
|
||||
// ReferenceError: getXPath is not defined at eval (eval at evaluate (:152:29), <anonymous>:67:20) at UtilityScript.evaluate (<anonymous>:159:18) at UtilityScript.<anonymous> (<anonymous>:1:44)
|
||||
xpath_result = getxpath(element);
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
|
||||
|
||||
let text = element.textContent.trim().slice(0, 30).trim();
|
||||
while (/\n{2,}|\t{2,}/.test(text)) {
|
||||
text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
|
||||
}
|
||||
|
||||
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
|
||||
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6))) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text);
|
||||
const computedStyle = window.getComputedStyle(element);
|
||||
|
||||
if (Math.floor(bbox['top']) + scroll_y > max_height) {
|
||||
return
|
||||
}
|
||||
|
||||
size_pos.push({
|
||||
xpath: xpath_result,
|
||||
width: Math.round(bbox['width']),
|
||||
height: Math.round(bbox['height']),
|
||||
left: Math.floor(bbox['left']),
|
||||
top: Math.floor(bbox['top']) + scroll_y,
|
||||
// tagName used by Browser Steps
|
||||
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
|
||||
// tagtype used by Browser Steps
|
||||
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
|
||||
isClickable: computedStyle.cursor === "pointer",
|
||||
// Used by the keras trainer
|
||||
fontSize: computedStyle.getPropertyValue('font-size'),
|
||||
fontWeight: computedStyle.getPropertyValue('font-weight'),
|
||||
hasDigitCurrency: hasDigitCurrency,
|
||||
label: label,
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
|
||||
// Inject the current one set in the include_filters, which may be a CSS rule
|
||||
// used for displaying the current one in VisualSelector, where its not one we generated.
|
||||
if (include_filters.length) {
|
||||
let results;
|
||||
// Foreach filter, go and find it on the page and add it to the results so we can visualise it again
|
||||
for (const f of include_filters) {
|
||||
bbox = false;
|
||||
q = false;
|
||||
if (include_filters.length) {
|
||||
let results;
|
||||
// Foreach filter, go and find it on the page and add it to the results so we can visualise it again
|
||||
for (const f of include_filters) {
|
||||
bbox = false;
|
||||
q = false;
|
||||
|
||||
if (!f.length) {
|
||||
console.log("xpath_element_scraper: Empty filter, skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
// is it xpath?
|
||||
if (f.startsWith('/') || f.startsWith('xpath')) {
|
||||
var qry_f = f.replace(/xpath(:|\d:)/, '')
|
||||
console.log("[xpath] Scanning for included filter " + qry_f)
|
||||
let xpathResult = document.evaluate(qry_f, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
||||
results = [];
|
||||
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
||||
results.push(xpathResult.snapshotItem(i));
|
||||
}
|
||||
} else {
|
||||
console.log("[css] Scanning for included filter " + f)
|
||||
console.log("[css] Scanning for included filter " + f);
|
||||
results = document.querySelectorAll(f);
|
||||
if (!f.length) {
|
||||
console.log("xpath_element_scraper: Empty filter, skipping");
|
||||
continue;
|
||||
}
|
||||
} catch (e) {
|
||||
// Maybe catch DOMException and alert?
|
||||
console.log("xpath_element_scraper: Exception selecting element from filter " + f);
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
if (results != null && results.length) {
|
||||
|
||||
// Iterate over the results
|
||||
results.forEach(node => {
|
||||
// Try to resolve //something/text() back to its /something so we can atleast get the bounding box
|
||||
try {
|
||||
if (typeof node.nodeName == 'string' && node.nodeName === '#text') {
|
||||
node = node.parentElement
|
||||
try {
|
||||
// is it xpath?
|
||||
if (f.startsWith('/') || f.startsWith('xpath')) {
|
||||
var qry_f = f.replace(/xpath(:|\d:)/, '')
|
||||
console.log("[xpath] Scanning for included filter " + qry_f)
|
||||
let xpathResult = document.evaluate(qry_f, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
||||
results = [];
|
||||
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
||||
results.push(xpathResult.snapshotItem(i));
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(e)
|
||||
console.log("xpath_element_scraper: #text resolver")
|
||||
}
|
||||
|
||||
// #1231 - IN the case XPath attribute filter is applied, we will have to traverse up and find the element.
|
||||
if (typeof node.getBoundingClientRect == 'function') {
|
||||
bbox = node.getBoundingClientRect();
|
||||
console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y)
|
||||
} else {
|
||||
console.log("[css] Scanning for included filter " + f)
|
||||
console.log("[css] Scanning for included filter " + f);
|
||||
results = document.querySelectorAll(f);
|
||||
}
|
||||
} catch (e) {
|
||||
// Maybe catch DOMException and alert?
|
||||
console.log("xpath_element_scraper: Exception selecting element from filter " + f);
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
if (results != null && results.length) {
|
||||
|
||||
// Iterate over the results
|
||||
results.forEach(node => {
|
||||
// Try to resolve //something/text() back to its /something so we can atleast get the bounding box
|
||||
try {
|
||||
// Try and see we can find its ownerElement
|
||||
bbox = node.ownerElement.getBoundingClientRect();
|
||||
console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y)
|
||||
if (typeof node.nodeName == 'string' && node.nodeName === '#text') {
|
||||
node = node.parentElement
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(e)
|
||||
console.log("xpath_element_scraper: error looking up q.ownerElement")
|
||||
console.log("xpath_element_scraper: #text resolver")
|
||||
}
|
||||
}
|
||||
|
||||
if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
|
||||
size_pos.push({
|
||||
xpath: f,
|
||||
width: parseInt(bbox['width']),
|
||||
height: parseInt(bbox['height']),
|
||||
left: parseInt(bbox['left']),
|
||||
top: parseInt(bbox['top']) + scroll_y,
|
||||
highlight_as_custom_filter: true
|
||||
});
|
||||
}
|
||||
});
|
||||
// #1231 - IN the case XPath attribute filter is applied, we will have to traverse up and find the element.
|
||||
if (typeof node.getBoundingClientRect == 'function') {
|
||||
bbox = node.getBoundingClientRect();
|
||||
console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y)
|
||||
} else {
|
||||
try {
|
||||
// Try and see we can find its ownerElement
|
||||
bbox = node.ownerElement.getBoundingClientRect();
|
||||
console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y)
|
||||
} catch (e) {
|
||||
console.log(e)
|
||||
console.log("xpath_element_scraper: error looking up q.ownerElement")
|
||||
}
|
||||
}
|
||||
|
||||
if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
|
||||
size_pos.push({
|
||||
xpath: f,
|
||||
width: parseInt(bbox['width']),
|
||||
height: parseInt(bbox['height']),
|
||||
left: parseInt(bbox['left']),
|
||||
top: parseInt(bbox['top']) + scroll_y,
|
||||
highlight_as_custom_filter: true
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area
|
||||
// so that we dont select the wrapping element by mistake and be unable to select what we want
|
||||
size_pos.sort((a, b) => (a.width * a.height > b.width * b.height) ? 1 : -1)
|
||||
size_pos.sort((a, b) => (a.width * a.height > b.width * b.height) ? 1 : -1)
|
||||
|
||||
// browser_width required for proper scaling in the frontend
|
||||
// Return as a string to save playwright for juggling thousands of objects
|
||||
return JSON.stringify({'size_pos': size_pos, 'browser_width': window.innerWidth});
|
||||
}
|
||||
|
||||
// Window.width required for proper scaling in the frontend
|
||||
return {'size_pos': size_pos, 'browser_width': window.innerWidth};
|
||||
|
||||
73
changedetectionio/content_fetchers/screenshot_handler.py
Normal file
73
changedetectionio/content_fetchers/screenshot_handler.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# Pages with a vertical height longer than this will use the 'stitch together' method.
|
||||
|
||||
# - Many GPUs have a max texture size of 16384x16384px (or lower on older devices).
|
||||
# - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits.
|
||||
# - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer.
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, SCREENSHOT_DEFAULT_QUALITY
|
||||
|
||||
|
||||
def stitch_images_worker(pipe_conn, chunks_bytes, original_page_height, capture_height):
|
||||
import os
|
||||
import io
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
try:
|
||||
|
||||
# Load images from byte chunks
|
||||
images = [Image.open(io.BytesIO(b)) for b in chunks_bytes]
|
||||
total_height = sum(im.height for im in images)
|
||||
max_width = max(im.width for im in images)
|
||||
|
||||
# Create stitched image
|
||||
stitched = Image.new('RGB', (max_width, total_height))
|
||||
y_offset = 0
|
||||
for im in images:
|
||||
stitched.paste(im, (0, y_offset))
|
||||
y_offset += im.height
|
||||
|
||||
# Draw caption on top (overlaid, not extending canvas)
|
||||
draw = ImageDraw.Draw(stitched)
|
||||
|
||||
if original_page_height > capture_height:
|
||||
caption_text = f"WARNING: Screenshot was {original_page_height}px but trimmed to {capture_height}px because it was too long"
|
||||
padding = 10
|
||||
font_size = 35
|
||||
font_color = (255, 0, 0)
|
||||
background_color = (255, 255, 255)
|
||||
|
||||
|
||||
# Try to load a proper font
|
||||
try:
|
||||
font = ImageFont.truetype("arial.ttf", font_size)
|
||||
except IOError:
|
||||
font = ImageFont.load_default()
|
||||
|
||||
bbox = draw.textbbox((0, 0), caption_text, font=font)
|
||||
text_width = bbox[2] - bbox[0]
|
||||
text_height = bbox[3] - bbox[1]
|
||||
|
||||
# Draw white rectangle background behind text
|
||||
rect_top = 0
|
||||
rect_bottom = text_height + 2 * padding
|
||||
draw.rectangle([(0, rect_top), (max_width, rect_bottom)], fill=background_color)
|
||||
|
||||
# Draw text centered horizontally, 10px padding from top of the rectangle
|
||||
text_x = (max_width - text_width) // 2
|
||||
text_y = padding
|
||||
draw.text((text_x, text_y), caption_text, font=font, fill=font_color)
|
||||
|
||||
# Encode and send image
|
||||
output = io.BytesIO()
|
||||
stitched.save(output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)))
|
||||
pipe_conn.send_bytes(output.getvalue())
|
||||
|
||||
stitched.close()
|
||||
except Exception as e:
|
||||
pipe_conn.send(f"error:{e}")
|
||||
finally:
|
||||
pipe_conn.close()
|
||||
|
||||
|
||||
@@ -65,6 +65,17 @@ class fetcher(Fetcher):
|
||||
# request_body, request_method unused for now, until some magic in the future happens.
|
||||
|
||||
options = ChromeOptions()
|
||||
|
||||
# Load Chrome options from env
|
||||
CHROME_OPTIONS = [
|
||||
line.strip()
|
||||
for line in os.getenv("CHROME_OPTIONS", "").strip().splitlines()
|
||||
if line.strip()
|
||||
]
|
||||
|
||||
for opt in CHROME_OPTIONS:
|
||||
options.add_argument(opt)
|
||||
|
||||
if self.proxy:
|
||||
options.proxy = self.proxy
|
||||
|
||||
@@ -79,7 +90,9 @@ class fetcher(Fetcher):
|
||||
self.quit()
|
||||
raise
|
||||
|
||||
self.driver.set_window_size(1280, 1024)
|
||||
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
|
||||
self.driver.set_window_size(1280, 1024)
|
||||
|
||||
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
|
||||
|
||||
if self.webdriver_js_execute_code is not None:
|
||||
@@ -87,6 +100,7 @@ class fetcher(Fetcher):
|
||||
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
|
||||
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
|
||||
|
||||
|
||||
# @todo - how to check this? is it possible?
|
||||
self.status_code = 200
|
||||
# @todo somehow we should try to get this working for WebDriver
|
||||
@@ -112,9 +126,9 @@ class fetcher(Fetcher):
|
||||
self.quit()
|
||||
return True
|
||||
|
||||
def quit(self):
|
||||
def quit(self, watch=None):
|
||||
if self.driver:
|
||||
try:
|
||||
self.driver.quit()
|
||||
except Exception as e:
|
||||
logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")
|
||||
logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")
|
||||
|
||||
@@ -33,7 +33,7 @@ from loguru import logger
|
||||
|
||||
from changedetectionio import __version__
|
||||
from changedetectionio import queuedWatchMetaData
|
||||
from changedetectionio.api import Watch, WatchHistory, WatchSingleHistory, CreateWatch, Import, SystemInfo, Tag, Tags
|
||||
from changedetectionio.api import Watch, WatchHistory, WatchSingleHistory, CreateWatch, Import, SystemInfo, Tag, Tags, Notifications
|
||||
from changedetectionio.api.Search import Search
|
||||
from .time_handler import is_within_schedule
|
||||
|
||||
@@ -285,7 +285,8 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
watch_api.add_resource(Search, '/api/v1/search',
|
||||
resource_class_kwargs={'datastore': datastore})
|
||||
|
||||
|
||||
watch_api.add_resource(Notifications, '/api/v1/notifications',
|
||||
resource_class_kwargs={'datastore': datastore})
|
||||
|
||||
@login_manager.user_loader
|
||||
def user_loader(email):
|
||||
@@ -394,7 +395,7 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
response.headers['Content-Type'] = 'application/json'
|
||||
response.headers['Content-Encoding'] = 'deflate'
|
||||
else:
|
||||
logger.error(f'Request elements.deflate at "{watch_directory}" but was notfound.')
|
||||
logger.error(f'Request elements.deflate at "{watch_directory}" but was not found.')
|
||||
abort(404)
|
||||
|
||||
if response:
|
||||
@@ -514,7 +515,8 @@ def notification_runner():
|
||||
sent_obj = None
|
||||
|
||||
try:
|
||||
from changedetectionio import notification
|
||||
from changedetectionio.notification.handler import process_notification
|
||||
|
||||
# Fallback to system config if not set
|
||||
if not n_object.get('notification_body') and datastore.data['settings']['application'].get('notification_body'):
|
||||
n_object['notification_body'] = datastore.data['settings']['application'].get('notification_body')
|
||||
@@ -524,8 +526,8 @@ def notification_runner():
|
||||
|
||||
if not n_object.get('notification_format') and datastore.data['settings']['application'].get('notification_format'):
|
||||
n_object['notification_format'] = datastore.data['settings']['application'].get('notification_format')
|
||||
|
||||
sent_obj = notification.process_notification(n_object, datastore)
|
||||
if n_object.get('notification_urls', {}):
|
||||
sent_obj = process_notification(n_object, datastore)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Watch URL: {n_object['watch_url']} Error {str(e)}")
|
||||
|
||||
@@ -306,8 +306,8 @@ class ValidateAppRiseServers(object):
|
||||
|
||||
def __call__(self, form, field):
|
||||
import apprise
|
||||
from .apprise_plugin.assets import apprise_asset
|
||||
from .apprise_plugin.custom_handlers import apprise_http_custom_handler # noqa: F401
|
||||
from .notification.apprise_plugin.assets import apprise_asset
|
||||
from .notification.apprise_plugin.custom_handlers import apprise_http_custom_handler # noqa: F401
|
||||
|
||||
apobj = apprise.Apprise(asset=apprise_asset)
|
||||
|
||||
@@ -586,7 +586,7 @@ class processor_text_json_diff_form(commonSettingsForm):
|
||||
filter_text_replaced = BooleanField('Replaced/changed lines', default=True)
|
||||
filter_text_removed = BooleanField('Removed lines', default=True)
|
||||
|
||||
trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])
|
||||
trigger_text = StringListField('Keyword triggers - Trigger/wait for text', [validators.Optional(), ValidateListRegex()])
|
||||
if os.getenv("PLAYWRIGHT_DRIVER_URL"):
|
||||
browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10)
|
||||
text_should_not_be_present = StringListField('Block change-detection while text matches', [validators.Optional(), ValidateListRegex()])
|
||||
@@ -721,6 +721,8 @@ class globalSettingsRequestForm(Form):
|
||||
self.extra_proxies.errors.append('Both a name, and a Proxy URL is required.')
|
||||
return False
|
||||
|
||||
class globalSettingsApplicationUIForm(Form):
|
||||
open_diff_in_new_tab = BooleanField('Open diff page in a new tab', default=True, validators=[validators.Optional()])
|
||||
|
||||
# datastore.data['settings']['application']..
|
||||
class globalSettingsApplicationForm(commonSettingsForm):
|
||||
@@ -752,6 +754,7 @@ class globalSettingsApplicationForm(commonSettingsForm):
|
||||
render_kw={"style": "width: 5em;"},
|
||||
validators=[validators.NumberRange(min=0,
|
||||
message="Should contain zero or more attempts")])
|
||||
ui = FormField(globalSettingsApplicationUIForm)
|
||||
|
||||
|
||||
class globalSettingsForm(Form):
|
||||
|
||||
@@ -366,22 +366,41 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
|
||||
# wordlist - list of regex's (str) or words (str)
|
||||
# Preserves all linefeeds and other whitespacing, its not the job of this to remove that
|
||||
def strip_ignore_text(content, wordlist, mode="content"):
|
||||
i = 0
|
||||
output = []
|
||||
ignore_text = []
|
||||
ignore_regex = []
|
||||
ignored_line_numbers = []
|
||||
ignore_regex_multiline = []
|
||||
ignored_lines = []
|
||||
|
||||
for k in wordlist:
|
||||
# Is it a regex?
|
||||
res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
|
||||
if res:
|
||||
ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
|
||||
res = re.compile(perl_style_slash_enclosed_regex_to_options(k))
|
||||
if res.flags & re.DOTALL or res.flags & re.MULTILINE:
|
||||
ignore_regex_multiline.append(res)
|
||||
else:
|
||||
ignore_regex.append(res)
|
||||
else:
|
||||
ignore_text.append(k.strip())
|
||||
|
||||
for line in content.splitlines(keepends=True):
|
||||
i += 1
|
||||
for r in ignore_regex_multiline:
|
||||
for match in r.finditer(content):
|
||||
content_lines = content[:match.end()].splitlines(keepends=True)
|
||||
match_lines = content[match.start():match.end()].splitlines(keepends=True)
|
||||
|
||||
end_line = len(content_lines)
|
||||
start_line = end_line - len(match_lines)
|
||||
|
||||
if end_line - start_line <= 1:
|
||||
# Match is empty or in the middle of the line
|
||||
ignored_lines.append(start_line)
|
||||
else:
|
||||
for i in range(start_line, end_line):
|
||||
ignored_lines.append(i)
|
||||
|
||||
line_index = 0
|
||||
lines = content.splitlines(keepends=True)
|
||||
for line in lines:
|
||||
# Always ignore blank lines in this mode. (when this function gets called)
|
||||
got_match = False
|
||||
for l in ignore_text:
|
||||
@@ -393,17 +412,19 @@ def strip_ignore_text(content, wordlist, mode="content"):
|
||||
if r.search(line):
|
||||
got_match = True
|
||||
|
||||
if not got_match:
|
||||
# Not ignored, and should preserve "keepends"
|
||||
output.append(line)
|
||||
else:
|
||||
ignored_line_numbers.append(i)
|
||||
if got_match:
|
||||
ignored_lines.append(line_index)
|
||||
|
||||
line_index += 1
|
||||
|
||||
ignored_lines = set([i for i in ignored_lines if i >= 0 and i < len(lines)])
|
||||
|
||||
# Used for finding out what to highlight
|
||||
if mode == "line numbers":
|
||||
return ignored_line_numbers
|
||||
return [i + 1 for i in ignored_lines]
|
||||
|
||||
return ''.join(output)
|
||||
output_lines = set(range(len(lines))) - ignored_lines
|
||||
return ''.join([lines[i] for i in output_lines])
|
||||
|
||||
def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
|
||||
from xml.sax.saxutils import escape as xml_escape
|
||||
@@ -414,7 +435,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
|
||||
|
||||
return re.sub(pattern, repl, html_content)
|
||||
|
||||
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
|
||||
|
||||
def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
|
||||
|
||||
from inscriptis import get_text
|
||||
from inscriptis.model.config import ParserConfig
|
||||
|
||||
@@ -449,15 +472,27 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
|
||||
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
||||
|
||||
text_content = get_text(html_content, config=parser_config)
|
||||
conn.send(text_content)
|
||||
conn.close()
|
||||
|
||||
return text_content
|
||||
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
|
||||
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
|
||||
from multiprocessing import Process, Pipe
|
||||
|
||||
parent_conn, child_conn = Pipe()
|
||||
p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
|
||||
p.start()
|
||||
text = parent_conn.recv()
|
||||
p.join()
|
||||
return text
|
||||
|
||||
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
|
||||
def has_ldjson_product_info(content):
|
||||
try:
|
||||
lc = content.lower()
|
||||
if 'application/ld+json' in lc and lc.count('"price"') == 1 and '"pricecurrency"' in lc:
|
||||
# Better than .lower() which can use a lot of ram
|
||||
if (re.search(r'application/ld\+json', content, re.IGNORECASE) and
|
||||
re.search(r'"price"', content, re.IGNORECASE) and
|
||||
re.search(r'"pricecurrency"', content, re.IGNORECASE)):
|
||||
return True
|
||||
|
||||
# On some pages this is really terribly expensive when they dont really need it
|
||||
|
||||
@@ -60,6 +60,9 @@ class model(dict):
|
||||
'webdriver_delay': None , # Extra delay in seconds before extracting text
|
||||
'tags': {}, #@todo use Tag.model initialisers
|
||||
'timezone': None, # Default IANA timezone name
|
||||
'ui': {
|
||||
'open_diff_in_new_tab': True,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -553,7 +553,10 @@ class model(watch_base):
|
||||
self.ensure_data_dir_exists()
|
||||
|
||||
with open(target_path, 'wb') as f:
|
||||
f.write(zlib.compress(json.dumps(data).encode()))
|
||||
if not isinstance(data, str):
|
||||
f.write(zlib.compress(json.dumps(data).encode()))
|
||||
else:
|
||||
f.write(zlib.compress(data.encode()))
|
||||
f.close()
|
||||
|
||||
# Save as PNG, PNG is larger but better for doing visual diff in the future
|
||||
|
||||
@@ -2,7 +2,7 @@ import os
|
||||
import uuid
|
||||
|
||||
from changedetectionio import strtobool
|
||||
from changedetectionio.notification import default_notification_format_for_watch
|
||||
default_notification_format_for_watch = 'System default'
|
||||
|
||||
class watch_base(dict):
|
||||
|
||||
|
||||
35
changedetectionio/notification/__init__.py
Normal file
35
changedetectionio/notification/__init__.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from changedetectionio.model import default_notification_format_for_watch
|
||||
|
||||
ult_notification_format_for_watch = 'System default'
|
||||
default_notification_format = 'HTML Color'
|
||||
default_notification_body = '{{watch_url}} had a change.\n---\n{{diff}}\n---\n'
|
||||
default_notification_title = 'ChangeDetection.io Notification - {{watch_url}}'
|
||||
|
||||
# The values (markdown etc) are from apprise NotifyFormat,
|
||||
# But to avoid importing the whole heavy module just use the same strings here.
|
||||
valid_notification_formats = {
|
||||
'Text': 'text',
|
||||
'Markdown': 'markdown',
|
||||
'HTML': 'html',
|
||||
'HTML Color': 'htmlcolor',
|
||||
# Used only for editing a watch (not for global)
|
||||
default_notification_format_for_watch: default_notification_format_for_watch
|
||||
}
|
||||
|
||||
|
||||
valid_tokens = {
|
||||
'base_url': '',
|
||||
'current_snapshot': '',
|
||||
'diff': '',
|
||||
'diff_added': '',
|
||||
'diff_full': '',
|
||||
'diff_patch': '',
|
||||
'diff_removed': '',
|
||||
'diff_url': '',
|
||||
'preview_url': '',
|
||||
'triggered_text': '',
|
||||
'watch_tag': '',
|
||||
'watch_title': '',
|
||||
'watch_url': '',
|
||||
'watch_uuid': '',
|
||||
}
|
||||
@@ -1,47 +1,17 @@
|
||||
|
||||
import time
|
||||
from apprise import NotifyFormat
|
||||
import apprise
|
||||
from loguru import logger
|
||||
|
||||
from .apprise_plugin.assets import APPRISE_AVATAR_URL
|
||||
from .apprise_plugin.custom_handlers import apprise_http_custom_handler # noqa: F401
|
||||
from .safe_jinja import render as jinja_render
|
||||
|
||||
valid_tokens = {
|
||||
'base_url': '',
|
||||
'current_snapshot': '',
|
||||
'diff': '',
|
||||
'diff_added': '',
|
||||
'diff_full': '',
|
||||
'diff_patch': '',
|
||||
'diff_removed': '',
|
||||
'diff_url': '',
|
||||
'preview_url': '',
|
||||
'triggered_text': '',
|
||||
'watch_tag': '',
|
||||
'watch_title': '',
|
||||
'watch_url': '',
|
||||
'watch_uuid': '',
|
||||
}
|
||||
|
||||
default_notification_format_for_watch = 'System default'
|
||||
default_notification_format = 'HTML Color'
|
||||
default_notification_body = '{{watch_url}} had a change.\n---\n{{diff}}\n---\n'
|
||||
default_notification_title = 'ChangeDetection.io Notification - {{watch_url}}'
|
||||
|
||||
valid_notification_formats = {
|
||||
'Text': NotifyFormat.TEXT,
|
||||
'Markdown': NotifyFormat.MARKDOWN,
|
||||
'HTML': NotifyFormat.HTML,
|
||||
'HTML Color': 'htmlcolor',
|
||||
# Used only for editing a watch (not for global)
|
||||
default_notification_format_for_watch: default_notification_format_for_watch
|
||||
}
|
||||
|
||||
from .apprise_plugin.assets import apprise_asset, APPRISE_AVATAR_URL
|
||||
|
||||
|
||||
def process_notification(n_object, datastore):
|
||||
from changedetectionio.safe_jinja import render as jinja_render
|
||||
from . import default_notification_format_for_watch, default_notification_format, valid_notification_formats
|
||||
# be sure its registered
|
||||
from .apprise_plugin.custom_handlers import apprise_http_custom_handler
|
||||
|
||||
now = time.time()
|
||||
if n_object.get('notification_timestamp'):
|
||||
logger.trace(f"Time since queued {now-n_object['notification_timestamp']:.3f}s")
|
||||
@@ -58,14 +28,13 @@ def process_notification(n_object, datastore):
|
||||
# Initially text or whatever
|
||||
n_format = datastore.data['settings']['application'].get('notification_format', valid_notification_formats[default_notification_format])
|
||||
|
||||
logger.trace(f"Complete notification body including Jinja and placeholders calculated in {time.time() - now:.3f}s")
|
||||
logger.trace(f"Complete notification body including Jinja and placeholders calculated in {time.time() - now:.2f}s")
|
||||
|
||||
# https://github.com/caronc/apprise/wiki/Development_LogCapture
|
||||
# Anything higher than or equal to WARNING (which covers things like Connection errors)
|
||||
# raise it as an exception
|
||||
|
||||
sent_objs = []
|
||||
from .apprise_plugin.assets import apprise_asset
|
||||
|
||||
if 'as_async' in n_object:
|
||||
apprise_asset.async_mode = n_object.get('as_async')
|
||||
@@ -176,6 +145,7 @@ def process_notification(n_object, datastore):
|
||||
# ( Where we prepare the tokens in the notification to be replaced with actual values )
|
||||
def create_notification_parameters(n_object, datastore):
|
||||
from copy import deepcopy
|
||||
from . import valid_tokens
|
||||
|
||||
# in the case we send a test notification from the main settings, there is no UUID.
|
||||
uuid = n_object['uuid'] if 'uuid' in n_object else ''
|
||||
82
changedetectionio/pluggy_interface.py
Normal file
82
changedetectionio/pluggy_interface.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import pluggy
|
||||
import os
|
||||
import importlib
|
||||
import sys
|
||||
|
||||
# Global plugin namespace for changedetection.io
|
||||
PLUGIN_NAMESPACE = "changedetectionio"
|
||||
|
||||
hookspec = pluggy.HookspecMarker(PLUGIN_NAMESPACE)
|
||||
hookimpl = pluggy.HookimplMarker(PLUGIN_NAMESPACE)
|
||||
|
||||
|
||||
class ChangeDetectionSpec:
|
||||
"""Hook specifications for extending changedetection.io functionality."""
|
||||
|
||||
@hookspec
|
||||
def ui_edit_stats_extras(watch):
|
||||
"""Return HTML content to add to the stats tab in the edit view.
|
||||
|
||||
Args:
|
||||
watch: The watch object being edited
|
||||
|
||||
Returns:
|
||||
str: HTML content to be inserted in the stats tab
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
# Set up Plugin Manager
|
||||
plugin_manager = pluggy.PluginManager(PLUGIN_NAMESPACE)
|
||||
|
||||
# Register hookspecs
|
||||
plugin_manager.add_hookspecs(ChangeDetectionSpec)
|
||||
|
||||
# Load plugins from subdirectories
|
||||
def load_plugins_from_directories():
|
||||
# Dictionary of directories to scan for plugins
|
||||
plugin_dirs = {
|
||||
'conditions': os.path.join(os.path.dirname(__file__), 'conditions', 'plugins'),
|
||||
# Add more plugin directories here as needed
|
||||
}
|
||||
|
||||
# Note: Removed the direct import of example_word_count_plugin as it's now in the conditions/plugins directory
|
||||
|
||||
for dir_name, dir_path in plugin_dirs.items():
|
||||
if not os.path.exists(dir_path):
|
||||
continue
|
||||
|
||||
# Get all Python files (excluding __init__.py)
|
||||
for filename in os.listdir(dir_path):
|
||||
if filename.endswith(".py") and filename != "__init__.py":
|
||||
module_name = filename[:-3] # Remove .py extension
|
||||
module_path = f"changedetectionio.{dir_name}.plugins.{module_name}"
|
||||
|
||||
try:
|
||||
module = importlib.import_module(module_path)
|
||||
# Register the plugin with pluggy
|
||||
plugin_manager.register(module, module_name)
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Error loading plugin {module_name}: {e}")
|
||||
|
||||
# Load plugins
|
||||
load_plugins_from_directories()
|
||||
|
||||
# Discover installed plugins from external packages (if any)
|
||||
plugin_manager.load_setuptools_entrypoints(PLUGIN_NAMESPACE)
|
||||
|
||||
# Helper function to collect UI stats extras from all plugins
|
||||
def collect_ui_edit_stats_extras(watch):
|
||||
"""Collect and combine HTML content from all plugins that implement ui_edit_stats_extras"""
|
||||
extras_content = []
|
||||
|
||||
# Get all plugins that implement the ui_edit_stats_extras hook
|
||||
results = plugin_manager.hook.ui_edit_stats_extras(watch=watch)
|
||||
|
||||
# If we have results, add them to our content
|
||||
if results:
|
||||
for result in results:
|
||||
if result: # Skip empty results
|
||||
extras_content.append(result)
|
||||
|
||||
return "\n".join(extras_content) if extras_content else ""
|
||||
@@ -159,7 +159,7 @@ class difference_detection_processor():
|
||||
)
|
||||
|
||||
#@todo .quit here could go on close object, so we can run JS if change-detected
|
||||
self.fetcher.quit()
|
||||
self.fetcher.quit(watch=self.watch)
|
||||
|
||||
# After init, call run_changedetection() which will do the actual change-detection
|
||||
|
||||
|
||||
@@ -252,6 +252,7 @@ class perform_site_check(difference_detection_processor):
|
||||
|
||||
# 615 Extract text by regex
|
||||
extract_text = watch.get('extract_text', [])
|
||||
extract_text += self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='extract_text')
|
||||
if len(extract_text) > 0:
|
||||
regex_matched_output = []
|
||||
for s_re in extract_text:
|
||||
@@ -296,6 +297,8 @@ class perform_site_check(difference_detection_processor):
|
||||
### CALCULATE MD5
|
||||
# If there's text to ignore
|
||||
text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
|
||||
text_to_ignore += self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='ignore_text')
|
||||
|
||||
text_for_checksuming = stripped_text_from_html
|
||||
if text_to_ignore:
|
||||
text_for_checksuming = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
|
||||
@@ -308,8 +311,8 @@ class perform_site_check(difference_detection_processor):
|
||||
|
||||
############ Blocking rules, after checksum #################
|
||||
blocked = False
|
||||
|
||||
trigger_text = watch.get('trigger_text', [])
|
||||
trigger_text += self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='trigger_text')
|
||||
if len(trigger_text):
|
||||
# Assume blocked
|
||||
blocked = True
|
||||
@@ -324,6 +327,7 @@ class perform_site_check(difference_detection_processor):
|
||||
blocked = False
|
||||
|
||||
text_should_not_be_present = watch.get('text_should_not_be_present', [])
|
||||
text_should_not_be_present += self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='text_should_not_be_present')
|
||||
if len(text_should_not_be_present):
|
||||
# If anything matched, then we should block a change from happening
|
||||
result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
|
||||
|
||||
@@ -14,7 +14,8 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
find tests/test_*py -type f|while read test_name
|
||||
do
|
||||
echo "TEST RUNNING $test_name"
|
||||
pytest $test_name
|
||||
# REMOVE_REQUESTS_OLD_SCREENSHOTS disabled so that we can write a screenshot and send it in test_notifications.py without a real browser
|
||||
REMOVE_REQUESTS_OLD_SCREENSHOTS=false pytest $test_name
|
||||
done
|
||||
|
||||
echo "RUNNING WITH BASE_URL SET"
|
||||
@@ -22,7 +23,7 @@ echo "RUNNING WITH BASE_URL SET"
|
||||
# Now re-run some tests with BASE_URL enabled
|
||||
# Re #65 - Ability to include a link back to the installation, in the notification.
|
||||
export BASE_URL="https://really-unique-domain.io"
|
||||
pytest tests/test_notification.py
|
||||
REMOVE_REQUESTS_OLD_SCREENSHOTS=false pytest tests/test_notification.py
|
||||
|
||||
|
||||
# Re-run with HIDE_REFERER set - could affect login
|
||||
@@ -32,7 +33,7 @@ pytest tests/test_access_control.py
|
||||
# Re-run a few tests that will trigger brotli based storage
|
||||
export SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD=5
|
||||
pytest tests/test_access_control.py
|
||||
pytest tests/test_notification.py
|
||||
REMOVE_REQUESTS_OLD_SCREENSHOTS=false pytest tests/test_notification.py
|
||||
pytest tests/test_backend.py
|
||||
pytest tests/test_rss.py
|
||||
pytest tests/test_unique_lines.py
|
||||
|
||||
@@ -211,7 +211,14 @@ $(document).ready(function () {
|
||||
$('input[type=text]', first_available).first().val(x['xpath']);
|
||||
$('input[placeholder="Value"]', first_available).addClass('ok').click().focus();
|
||||
found_something = true;
|
||||
} else {
|
||||
}
|
||||
else if (x['tagName'] === 'select') {
|
||||
$('select', first_available).val('<select> by option text').change();
|
||||
$('input[type=text]', first_available).first().val(x['xpath']);
|
||||
$('input[placeholder="Value"]', first_available).addClass('ok').click().focus();
|
||||
found_something = true;
|
||||
}
|
||||
else {
|
||||
// There's no good way (that I know) to find if this
|
||||
// see https://stackoverflow.com/questions/446892/how-to-find-event-listeners-on-a-dom-node-in-javascript-or-in-debugging
|
||||
// https://codepen.io/azaslavsky/pen/DEJVWv
|
||||
@@ -251,6 +258,10 @@ $(document).ready(function () {
|
||||
400: function () {
|
||||
// More than likely the CSRF token was lost when the server restarted
|
||||
alert("There was a problem processing the request, please reload the page.");
|
||||
},
|
||||
401: function (err) {
|
||||
// This will be a custom error
|
||||
alert(err.responseText);
|
||||
}
|
||||
}
|
||||
}).done(function (data) {
|
||||
|
||||
@@ -964,3 +964,25 @@ class ChangeDetectionStore:
|
||||
f_d.write(zlib.compress(f_j.read()))
|
||||
os.unlink(json_path)
|
||||
|
||||
def add_notification_url(self, notification_url):
|
||||
|
||||
logger.debug(f">>> Adding new notification_url - '{notification_url}'")
|
||||
|
||||
notification_urls = self.data['settings']['application'].get('notification_urls', [])
|
||||
|
||||
if notification_url in notification_urls:
|
||||
return notification_url
|
||||
|
||||
with self.lock:
|
||||
notification_urls = self.__data['settings']['application'].get('notification_urls', [])
|
||||
|
||||
if notification_url in notification_urls:
|
||||
return notification_url
|
||||
|
||||
# Append and update the datastore
|
||||
notification_urls.append(notification_url)
|
||||
self.__data['settings']['application']['notification_urls'] = notification_urls
|
||||
self.needs_write = True
|
||||
|
||||
return notification_url
|
||||
|
||||
|
||||
@@ -98,15 +98,13 @@
|
||||
|
||||
|
||||
{% macro playwright_warning() %}
|
||||
<p><strong>Error - Playwright support for Chrome based fetching is not enabled.</strong> Alternatively try our <a href="https://changedetection.io">very affordable subscription based service which has all this setup for you</a>.</p>
|
||||
<p><strong>Error - This watch needs Chrome (with playwright/sockpuppetbrowser), but Chrome based fetching is not enabled.</strong> Alternatively try our <a href="https://changedetection.io">very affordable subscription based service which has all this setup for you</a>.</p>
|
||||
<p>You may need to <a href="https://github.com/dgtlmoon/changedetection.io/blob/09ebc6ec6338545bdd694dc6eee57f2e9d2b8075/docker-compose.yml#L31">Enable playwright environment variable</a> and uncomment the <strong>sockpuppetbrowser</strong> in the <a href="https://github.com/dgtlmoon/changedetection.io/blob/master/docker-compose.yml">docker-compose.yml</a> file.</p>
|
||||
<br>
|
||||
<p>(Also Selenium/WebDriver can not extract full page screenshots reliably so Playwright is recommended here)</p>
|
||||
|
||||
{% endmacro %}
|
||||
|
||||
{% macro only_webdriver_type_watches_warning() %}
|
||||
<p><strong>Sorry, this functionality only works with Playwright/Chrome enabled watches.<br>You need to <a href="#request">Set the fetch method to Playwright/Chrome mode and resave</a> and have the Playwright connection enabled.</strong></p><br>
|
||||
{% macro only_playwright_type_watches_warning() %}
|
||||
<p><strong>Sorry, this functionality only works with Playwright/Chrome enabled watches.<br>You need to <a href="#request">Set the fetch method to Playwright/Chrome mode and resave</a> and have the SockpuppetBrowser/Playwright or Selenium enabled.</strong></p><br>
|
||||
{% endmacro %}
|
||||
|
||||
{% macro render_time_schedule_form(form, available_timezones, timezone_default_config) %}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{% extends 'base.html' %}
|
||||
{% block content %}
|
||||
{% from '_helpers.html' import render_field, render_checkbox_field, render_button, render_time_schedule_form, playwright_warning, only_webdriver_type_watches_warning, render_conditions_fieldlist_of_formfields_as_table %}
|
||||
{% from '_helpers.html' import render_field, render_checkbox_field, render_button, render_time_schedule_form, playwright_warning, only_playwright_type_watches_warning, render_conditions_fieldlist_of_formfields_as_table %}
|
||||
{% from '_common_fields.html' import render_common_settings_form %}
|
||||
<script src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
|
||||
<script src="{{url_for('static_content', group='js', filename='vis.js')}}" defer></script>
|
||||
@@ -204,7 +204,9 @@ Math: {{ 1 + 1 }}") }}
|
||||
</div>
|
||||
|
||||
<div class="tab-pane-inner" id="browser-steps">
|
||||
{% if playwright_enabled and watch_uses_webdriver %}
|
||||
{% if watch_needs_selenium_or_playwright %}
|
||||
{# Only works with playwright #}
|
||||
{% if system_has_playwright_configured %}
|
||||
<img class="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}" alt="New beta functionality">
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
@@ -223,7 +225,6 @@ Math: {{ 1 + 1 }}") }}
|
||||
<div class="flex-wrapper" >
|
||||
|
||||
<div id="browser-steps-ui" class="noselect">
|
||||
|
||||
<div class="noselect" id="browsersteps-selector-wrapper" style="width: 100%">
|
||||
<span class="loader" >
|
||||
<span id="browsersteps-click-start">
|
||||
@@ -245,15 +246,16 @@ Math: {{ 1 + 1 }}") }}
|
||||
</div>
|
||||
</fieldset>
|
||||
{% else %}
|
||||
<span class="pure-form-message-inline">
|
||||
{% if not watch_uses_webdriver %}
|
||||
{{ only_webdriver_type_watches_warning() }}
|
||||
{% endif %}
|
||||
{% if not playwright_enabled %}
|
||||
{{ playwright_warning() }}
|
||||
{% endif %}
|
||||
</span>
|
||||
{# it's configured to use selenium or chrome but system says its not configured #}
|
||||
{{ playwright_warning() }}
|
||||
{% if system_has_webdriver_configured %}
|
||||
<strong>Selenium/Webdriver cant be used here because it wont fetch screenshots reliably.</strong>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% else %}
|
||||
{# "This functionality needs chrome.." #}
|
||||
{{ only_playwright_type_watches_warning() }}
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
|
||||
@@ -262,7 +264,7 @@ Math: {{ 1 + 1 }}") }}
|
||||
<div class="pure-control-group inline-radio">
|
||||
{{ render_checkbox_field(form.notification_muted) }}
|
||||
</div>
|
||||
{% if watch_uses_webdriver %}
|
||||
{% if watch_needs_selenium_or_playwright %}
|
||||
<div class="pure-control-group inline-radio">
|
||||
{{ render_checkbox_field(form.notification_screenshot) }}
|
||||
<span class="pure-form-message-inline">
|
||||
@@ -314,61 +316,8 @@ Math: {{ 1 + 1 }}") }}
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="pure-control-group">
|
||||
{% set field = render_field(form.include_filters,
|
||||
rows=5,
|
||||
placeholder=has_tag_filters_extra+"#example
|
||||
xpath://body/div/span[contains(@class, 'example-class')]",
|
||||
class="m-d")
|
||||
%}
|
||||
{{ field }}
|
||||
{% if '/text()' in field %}
|
||||
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the <element> contains <![CDATA[]]></strong></span><br>
|
||||
{% endif %}
|
||||
<span class="pure-form-message-inline">One CSS, xPath 1 & 2, JSON Path/JQ selector per line, <i>any</i> rules that matches will be used.<br>
|
||||
<span data-target="#advanced-help-selectors" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</span><br>
|
||||
<ul id="advanced-help-selectors" style="display: none;">
|
||||
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
||||
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).
|
||||
<ul>
|
||||
<li>JSONPath: Prefix with <code>json:</code>, use <code>json:$</code> to force re-formatting if required, <a href="https://jsonpath.com/" target="new">test your JSONPath here</a>.</li>
|
||||
{% if jq_support %}
|
||||
<li>jq: Prefix with <code>jq:</code> and <a href="https://jqplay.org/" target="new">test your jq here</a>. Using <a href="https://stedolan.github.io/jq/" target="new">jq</a> allows for complex filtering and processing of JSON data with built-in functions, regex, filtering, and more. See examples and documentation <a href="https://stedolan.github.io/jq/manual/" target="new">here</a>. Prefix <code>jqraw:</code> outputs the results as text instead of a JSON list.</li>
|
||||
{% else %}
|
||||
<li>jq support not installed</li>
|
||||
{% endif %}
|
||||
</ul>
|
||||
</li>
|
||||
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash. To specify XPath to be used explicitly or the XPath rule starts with an XPath function: Prefix with <code>xpath:</code>
|
||||
<ul>
|
||||
<li>Example: <code>//*[contains(@class, 'sametext')]</code> or <code>xpath:count(//*[contains(@class, 'sametext')])</code>, <a
|
||||
href="http://xpather.com/" target="new">test your XPath here</a></li>
|
||||
<li>Example: Get all titles from an RSS feed <code>//title/text()</code></li>
|
||||
<li>To use XPath1.0: Prefix with <code>xpath1:</code></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
Please be sure that you thoroughly understand how to write CSS, JSONPath, XPath{% if jq_support %}, or jq selector{%endif%} rules before filing an issue on GitHub! <a
|
||||
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
<fieldset class="pure-control-group">
|
||||
{{ render_field(form.subtractive_selectors, rows=5, placeholder=has_tag_filters_extra+"header
|
||||
footer
|
||||
nav
|
||||
.stockticker
|
||||
//*[contains(text(), 'Advertisement')]") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
|
||||
<li> Don't paste HTML here, use only CSS and XPath selectors </li>
|
||||
<li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
|
||||
</ul>
|
||||
</span>
|
||||
</fieldset>
|
||||
{% include "edit/include_subtract.html" %}
|
||||
<div class="text-filtering border-fieldset">
|
||||
<fieldset class="pure-group" id="text-filtering-type-options">
|
||||
<h3>Text filtering</h3>
|
||||
@@ -396,76 +345,9 @@ nav
|
||||
{{ render_checkbox_field(form.trim_text_whitespace) }}
|
||||
<span class="pure-form-message-inline">Remove any whitespace before and after each line of text</span>
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
{{ render_field(form.trigger_text, rows=5, placeholder="Some text to wait for in a line
|
||||
/some.regex\d{2}/ for case-INsensitive regex
|
||||
") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>Text to wait for before triggering a change/notification, all text and regex are tested <i>case-insensitive</i>.</li>
|
||||
<li>Trigger text is processed from the result-text that comes out of any CSS/JSON Filters for this watch</li>
|
||||
<li>Each line is processed separately (think of each line as "OR")</li>
|
||||
<li>Note: Wrap in forward slash / to use regex example: <code>/foo\d/</code></li>
|
||||
</ul>
|
||||
</span>
|
||||
</div>
|
||||
</fieldset>
|
||||
<fieldset class="pure-group">
|
||||
{{ render_field(form.ignore_text, rows=5, placeholder="Some text to ignore in a line
|
||||
/some.regex\d{2}/ for case-INsensitive regex
|
||||
") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>Matching text will be <strong>ignored</strong> in the text snapshot (you can still see it but it wont trigger a change)</li>
|
||||
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
|
||||
<li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li>
|
||||
<li>Changing this will affect the comparison checksum which may trigger an alert</li>
|
||||
</ul>
|
||||
</span>
|
||||
|
||||
</fieldset>
|
||||
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
{{ render_field(form.text_should_not_be_present, rows=5, placeholder="For example: Out of stock
|
||||
Sold out
|
||||
Not in stock
|
||||
Unavailable") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>Block change-detection while this text is on the page, all text and regex are tested <i>case-insensitive</i>, good for waiting for when a product is available again</li>
|
||||
<li>Block text is processed from the result-text that comes out of any CSS/JSON Filters for this watch</li>
|
||||
<li>All lines here must not exist (think of each line as "OR")</li>
|
||||
<li>Note: Wrap in forward slash / to use regex example: <code>/foo\d/</code></li>
|
||||
</ul>
|
||||
</span>
|
||||
</div>
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
{{ render_field(form.extract_text, rows=5, placeholder="/.+?\d+ comments.+?/
|
||||
or
|
||||
keyword") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>Extracts text in the final output (line by line) after other filters using regular expressions or string match;
|
||||
<ul>
|
||||
<li>Regular expression ‐ example <code>/reports.+?2022/i</code></li>
|
||||
<li>Don't forget to consider the white-space at the start of a line <code>/.+?reports.+?2022/i</code></li>
|
||||
<li>Use <code>//(?aiLmsux))</code> type flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br></li>
|
||||
<li>Keyword example ‐ example <code>Out of stock</code></li>
|
||||
<li>Use groups to extract just that text ‐ example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
|
||||
<li>Example - match lines containing a keyword <code>/.*icecream.*/</code></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>One line per regular-expression/string match</li>
|
||||
</ul>
|
||||
</span>
|
||||
</div>
|
||||
</fieldset>
|
||||
{% include "edit/text-options.html" %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div id="text-preview" style="display: none;" >
|
||||
<script>
|
||||
const preview_text_edit_filters_url="{{url_for('ui.ui_edit.watch_get_preview_rendered', uuid=uuid)}}";
|
||||
@@ -499,13 +381,15 @@ keyword") }}
|
||||
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
{% if playwright_enabled and watch_uses_webdriver %}
|
||||
{% if watch_needs_selenium_or_playwright %}
|
||||
{% if system_has_playwright_configured %}
|
||||
<span class="pure-form-message-inline" id="visual-selector-heading">
|
||||
The Visual Selector tool lets you select the <i>text</i> elements that will be used for the change detection. It automatically fills-in the filters in the "CSS/JSONPath/JQ/XPath Filters" box of the <a href="#filters-and-triggers">Filters & Triggers</a> tab. Use <strong>Shift+Click</strong> to select multiple items.
|
||||
</span>
|
||||
|
||||
<div id="selector-header">
|
||||
<a id="clear-selector" class="pure-button button-secondary button-xsmall" style="font-size: 70%">Clear selection</a>
|
||||
<!-- visual selector IMG will try to load, it will either replace this or on error replace it with some handy text -->
|
||||
<i class="fetching-update-notice" style="font-size: 80%;">One moment, fetching screenshot and element information..</i>
|
||||
</div>
|
||||
<div id="selector-wrapper" style="display: none">
|
||||
@@ -517,13 +401,16 @@ keyword") }}
|
||||
</div>
|
||||
<div id="selector-current-xpath" style="overflow-x: hidden"><strong>Currently:</strong> <span class="text">Loading...</span></div>
|
||||
{% else %}
|
||||
{% if not watch_uses_webdriver %}
|
||||
{{ only_webdriver_type_watches_warning() }}
|
||||
{% endif %}
|
||||
{% if not playwright_enabled %}
|
||||
{{ playwright_warning() }}
|
||||
{% endif %}
|
||||
{# The watch needed chrome but system says that playwright is not ready #}
|
||||
{{ playwright_warning() }}
|
||||
{% endif %}
|
||||
{% if system_has_webdriver_configured %}
|
||||
<strong>Selenium/Webdriver cant be used here because it wont fetch screenshots reliably.</strong>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
{# "This functionality needs chrome.." #}
|
||||
{{ only_playwright_type_watches_warning() }}
|
||||
{% endif %}
|
||||
</div>
|
||||
</fieldset>
|
||||
</div>
|
||||
@@ -563,6 +450,13 @@ keyword") }}
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
{% if ui_edit_stats_extras %}
|
||||
<div class="plugin-stats-extras"> <!-- from pluggy plugin -->
|
||||
{{ ui_edit_stats_extras|safe }}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if watch.history_n %}
|
||||
<p>
|
||||
<a href="{{url_for('ui.ui_edit.watch_get_latest_html', uuid=uuid)}}" class="pure-button button-small">Download latest HTML snapshot</a>
|
||||
|
||||
55
changedetectionio/templates/edit/include_subtract.html
Normal file
55
changedetectionio/templates/edit/include_subtract.html
Normal file
@@ -0,0 +1,55 @@
|
||||
<div class="pure-control-group">
|
||||
{% set field = render_field(form.include_filters,
|
||||
rows=5,
|
||||
placeholder=has_tag_filters_extra+"#example
|
||||
xpath://body/div/span[contains(@class, 'example-class')]",
|
||||
class="m-d")
|
||||
%}
|
||||
{{ field }}
|
||||
{% if '/text()' in field %}
|
||||
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the <element> contains <![CDATA[]]></strong></span><br>
|
||||
{% endif %}
|
||||
<span class="pure-form-message-inline">One CSS, xPath 1 & 2, JSON Path/JQ selector per line, <i>any</i> rules that matches will be used.<br>
|
||||
<span data-target="#advanced-help-selectors" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</span><br>
|
||||
<ul id="advanced-help-selectors" style="display: none;">
|
||||
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
||||
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).
|
||||
<ul>
|
||||
<li>JSONPath: Prefix with <code>json:</code>, use <code>json:$</code> to force re-formatting if required, <a href="https://jsonpath.com/" target="new">test your JSONPath here</a>.</li>
|
||||
{% if jq_support %}
|
||||
<li>jq: Prefix with <code>jq:</code> and <a href="https://jqplay.org/" target="new">test your jq here</a>. Using <a href="https://stedolan.github.io/jq/" target="new">jq</a> allows for complex filtering and processing of JSON data with built-in functions, regex, filtering, and more. See examples and documentation <a href="https://stedolan.github.io/jq/manual/" target="new">here</a>. Prefix <code>jqraw:</code> outputs the results as text instead of a JSON list.</li>
|
||||
{% else %}
|
||||
<li>jq support not installed</li>
|
||||
{% endif %}
|
||||
</ul>
|
||||
</li>
|
||||
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash. To specify XPath to be used explicitly or the XPath rule starts with an XPath function: Prefix with <code>xpath:</code>
|
||||
<ul>
|
||||
<li>Example: <code>//*[contains(@class, 'sametext')]</code> or <code>xpath:count(//*[contains(@class, 'sametext')])</code>, <a
|
||||
href="http://xpather.com/" target="new">test your XPath here</a></li>
|
||||
<li>Example: Get all titles from an RSS feed <code>//title/text()</code></li>
|
||||
<li>To use XPath1.0: Prefix with <code>xpath1:</code></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
Please be sure that you thoroughly understand how to write CSS, JSONPath, XPath{% if jq_support %}, or jq selector{%endif%} rules before filing an issue on GitHub! <a
|
||||
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
<fieldset class="pure-control-group">
|
||||
{{ render_field(form.subtractive_selectors, rows=5, placeholder=has_tag_filters_extra+"header
|
||||
footer
|
||||
nav
|
||||
.stockticker
|
||||
//*[contains(text(), 'Advertisement')]") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
|
||||
<li> Don't paste HTML here, use only CSS and XPath selectors </li>
|
||||
<li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
|
||||
</ul>
|
||||
</span>
|
||||
</fieldset>
|
||||
69
changedetectionio/templates/edit/text-options.html
Normal file
69
changedetectionio/templates/edit/text-options.html
Normal file
@@ -0,0 +1,69 @@
|
||||
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
{{ render_field(form.trigger_text, rows=5, placeholder="Some text to wait for in a line
|
||||
/some.regex\d{2}/ for case-INsensitive regex
|
||||
") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>Text to wait for before triggering a change/notification, all text and regex are tested <i>case-insensitive</i>.</li>
|
||||
<li>Trigger text is processed from the result-text that comes out of any CSS/JSON Filters for this watch</li>
|
||||
<li>Each line is processed separately (think of each line as "OR")</li>
|
||||
<li>Note: Wrap in forward slash / to use regex example: <code>/foo\d/</code></li>
|
||||
</ul>
|
||||
</span>
|
||||
</div>
|
||||
</fieldset>
|
||||
<fieldset class="pure-group">
|
||||
{{ render_field(form.ignore_text, rows=5, placeholder="Some text to ignore in a line
|
||||
/some.regex\d{2}/ for case-INsensitive regex
|
||||
") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>Matching text will be <strong>ignored</strong> in the text snapshot (you can still see it but it wont trigger a change)</li>
|
||||
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
|
||||
<li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li>
|
||||
<li>Changing this will affect the comparison checksum which may trigger an alert</li>
|
||||
</ul>
|
||||
</span>
|
||||
|
||||
</fieldset>
|
||||
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
{{ render_field(form.text_should_not_be_present, rows=5, placeholder="For example: Out of stock
|
||||
Sold out
|
||||
Not in stock
|
||||
Unavailable") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>Block change-detection while this text is on the page, all text and regex are tested <i>case-insensitive</i>, good for waiting for when a product is available again</li>
|
||||
<li>Block text is processed from the result-text that comes out of any CSS/JSON Filters for this watch</li>
|
||||
<li>All lines here must not exist (think of each line as "OR")</li>
|
||||
<li>Note: Wrap in forward slash / to use regex example: <code>/foo\d/</code></li>
|
||||
</ul>
|
||||
</span>
|
||||
</div>
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
{{ render_field(form.extract_text, rows=5, placeholder="/.+?\d+ comments.+?/
|
||||
or
|
||||
keyword") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>Extracts text in the final output (line by line) after other filters using regular expressions or string match;
|
||||
<ul>
|
||||
<li>Regular expression ‐ example <code>/reports.+?2022/i</code></li>
|
||||
<li>Don't forget to consider the white-space at the start of a line <code>/.+?reports.+?2022/i</code></li>
|
||||
<li>Use <code>//(?aiLmsux))</code> type flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br></li>
|
||||
<li>Keyword example ‐ example <code>Out of stock</code></li>
|
||||
<li>Use groups to extract just that text ‐ example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
|
||||
<li>Example - match lines containing a keyword <code>/.*icecream.*/</code></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>One line per regular-expression/string match</li>
|
||||
</ul>
|
||||
</span>
|
||||
</div>
|
||||
</fieldset>
|
||||
108
changedetectionio/tests/test_api_notifications.py
Normal file
108
changedetectionio/tests/test_api_notifications.py
Normal file
@@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from flask import url_for
|
||||
from .util import live_server_setup
|
||||
import json
|
||||
|
||||
def test_api_notifications_crud(client, live_server):
|
||||
live_server_setup(live_server)
|
||||
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
|
||||
|
||||
# Confirm notifications are initially empty
|
||||
res = client.get(
|
||||
url_for("notifications"),
|
||||
headers={'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 200
|
||||
assert res.json == {"notification_urls": []}
|
||||
|
||||
# Add notification URLs
|
||||
test_urls = ["posts://example.com/notify1", "posts://example.com/notify2"]
|
||||
res = client.post(
|
||||
url_for("notifications"),
|
||||
data=json.dumps({"notification_urls": test_urls}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 201
|
||||
for url in test_urls:
|
||||
assert url in res.json["notification_urls"]
|
||||
|
||||
# Confirm the notification URLs were added
|
||||
res = client.get(
|
||||
url_for("notifications"),
|
||||
headers={'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 200
|
||||
for url in test_urls:
|
||||
assert url in res.json["notification_urls"]
|
||||
|
||||
# Delete one notification URL
|
||||
res = client.delete(
|
||||
url_for("notifications"),
|
||||
data=json.dumps({"notification_urls": [test_urls[0]]}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 204
|
||||
|
||||
# Confirm it was removed and the other remains
|
||||
res = client.get(
|
||||
url_for("notifications"),
|
||||
headers={'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 200
|
||||
assert test_urls[0] not in res.json["notification_urls"]
|
||||
assert test_urls[1] in res.json["notification_urls"]
|
||||
|
||||
# Try deleting a non-existent URL
|
||||
res = client.delete(
|
||||
url_for("notifications"),
|
||||
data=json.dumps({"notification_urls": ["posts://nonexistent.com"]}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 400
|
||||
|
||||
res = client.post(
|
||||
url_for("notifications"),
|
||||
data=json.dumps({"notification_urls": test_urls}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 201
|
||||
|
||||
# Replace with a new list
|
||||
replacement_urls = ["posts://new.example.com"]
|
||||
res = client.put(
|
||||
url_for("notifications"),
|
||||
data=json.dumps({"notification_urls": replacement_urls}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 200
|
||||
assert res.json["notification_urls"] == replacement_urls
|
||||
|
||||
# Replace with an empty list
|
||||
res = client.put(
|
||||
url_for("notifications"),
|
||||
data=json.dumps({"notification_urls": []}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 200
|
||||
assert res.json["notification_urls"] == []
|
||||
|
||||
# Provide an invalid AppRise URL to trigger validation error
|
||||
invalid_urls = ["ftp://not-app-rise"]
|
||||
res = client.post(
|
||||
url_for("notifications"),
|
||||
data=json.dumps({"notification_urls": invalid_urls}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 400
|
||||
assert "is not a valid AppRise URL." in res.data.decode()
|
||||
|
||||
res = client.put(
|
||||
url_for("notifications"),
|
||||
data=json.dumps({"notification_urls": invalid_urls}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 400
|
||||
assert "is not a valid AppRise URL." in res.data.decode()
|
||||
|
||||
|
||||
@@ -45,11 +45,15 @@ def set_number_out_of_range_response(number="150"):
|
||||
f.write(test_return_data)
|
||||
|
||||
|
||||
def test_setup(client, live_server):
|
||||
"""Test that both text and number conditions work together with AND logic."""
|
||||
live_server_setup(live_server)
|
||||
|
||||
def test_conditions_with_text_and_number(client, live_server):
|
||||
"""Test that both text and number conditions work together with AND logic."""
|
||||
|
||||
set_original_response("50")
|
||||
live_server_setup(live_server)
|
||||
#live_server_setup(live_server)
|
||||
|
||||
test_url = url_for('test_endpoint', _external=True)
|
||||
|
||||
@@ -195,3 +199,40 @@ def test_condition_validate_rule_row(client, live_server):
|
||||
|
||||
|
||||
|
||||
|
||||
# If there was only a change in the whitespacing, then we shouldnt have a change detected
|
||||
def test_wordcount_conditions_plugin(client, live_server, measure_memory_usage):
|
||||
#live_server_setup(live_server)
|
||||
|
||||
test_return_data = """<html>
|
||||
<body>
|
||||
Some initial text<br>
|
||||
<p>Which is across multiple lines</p>
|
||||
<br>
|
||||
So let's see what happens. <br>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write(test_return_data)
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_endpoint', _external=True)
|
||||
res = client.post(
|
||||
url_for("imports.import_page"),
|
||||
data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
# Give the thread time to pick it up
|
||||
wait_for_all_checks(client)
|
||||
|
||||
# Check it saved
|
||||
res = client.get(
|
||||
url_for("ui.ui_edit.edit_page", uuid="first"),
|
||||
)
|
||||
|
||||
# Assert the word count is counted correctly
|
||||
assert b'<td>13</td>' in res.data
|
||||
@@ -32,7 +32,6 @@ def test_strip_regex_text_func():
|
||||
]
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
|
||||
|
||||
assert "but 1 lines" in stripped_content
|
||||
assert "igNORe-cAse text" not in stripped_content
|
||||
assert "but 1234 lines" not in stripped_content
|
||||
@@ -42,6 +41,46 @@ def test_strip_regex_text_func():
|
||||
# Check line number reporting
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
|
||||
assert stripped_content == [2, 5, 6, 7, 8, 10]
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ['/but 1.+5 lines/s'])
|
||||
assert "but 1 lines" not in stripped_content
|
||||
assert "skip 5 lines" not in stripped_content
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ['/but 1.+5 lines/s'], mode="line numbers")
|
||||
assert stripped_content == [4, 5]
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ['/.+/s'])
|
||||
assert stripped_content == ""
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ['/.+/s'], mode="line numbers")
|
||||
assert stripped_content == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+but.+\\n.+lines$/m'])
|
||||
assert "but 1 lines" not in stripped_content
|
||||
assert "skip 5 lines" not in stripped_content
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+but.+\\n.+lines$/m'], mode="line numbers")
|
||||
assert stripped_content == [4, 5]
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+?\.$/m'])
|
||||
assert "but sometimes we want to remove the lines." not in stripped_content
|
||||
assert "but not always." not in stripped_content
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+?\.$/m'], mode="line numbers")
|
||||
assert stripped_content == [2, 11]
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ['/but.+?but/ms'])
|
||||
assert "but sometimes we want to remove the lines." not in stripped_content
|
||||
assert "but 1 lines" not in stripped_content
|
||||
assert "but 1234 lines" not in stripped_content
|
||||
assert "igNORe-cAse text we dont want to keep" not in stripped_content
|
||||
assert "but not always." not in stripped_content
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ['/but.+?but/ms'], mode="line numbers")
|
||||
assert stripped_content == [2, 3, 4, 9, 10, 11]
|
||||
|
||||
stripped_content = html_tools.strip_ignore_text("\n\ntext\n\ntext\n\n", ['/^$/ms'], mode="line numbers")
|
||||
assert stripped_content == [1, 2, 4, 6]
|
||||
|
||||
# Check that linefeeds are preserved when there are is no matching ignores
|
||||
content = "some text\n\nand other text\n"
|
||||
|
||||
@@ -32,13 +32,14 @@ def test_strip_text_func():
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ignore)
|
||||
assert stripped_content == "Some initial text\n\nWhich is across multiple lines\n\n\n\nSo let's see what happens."
|
||||
|
||||
def set_original_ignore_response():
|
||||
test_return_data = """<html>
|
||||
def set_original_ignore_response(ver_stamp="123"):
|
||||
test_return_data = f"""<html>
|
||||
<body>
|
||||
Some initial text<br>
|
||||
<p>Which is across multiple lines</p>
|
||||
<br>
|
||||
So let's see what happens. <br>
|
||||
<link href="https://www.somesite/wp-content/themes/cooltheme/style2.css?v={ver_stamp}" rel="stylesheet"/>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@@ -48,13 +49,14 @@ def set_original_ignore_response():
|
||||
f.write(test_return_data)
|
||||
|
||||
|
||||
def set_modified_original_ignore_response():
|
||||
test_return_data = """<html>
|
||||
def set_modified_original_ignore_response(ver_stamp="123"):
|
||||
test_return_data = f"""<html>
|
||||
<body>
|
||||
Some NEW nice initial text<br>
|
||||
<p>Which is across multiple lines</p>
|
||||
<br>
|
||||
So let's see what happens. <br>
|
||||
<link href="https://www.somesite/wp-content/themes/cooltheme/style2.css?v={ver_stamp}" rel="stylesheet"/>
|
||||
<p>new ignore stuff</p>
|
||||
<p>blah</p>
|
||||
</body>
|
||||
@@ -67,14 +69,15 @@ def set_modified_original_ignore_response():
|
||||
|
||||
|
||||
# Is the same but includes ZZZZZ, 'ZZZZZ' is the last line in ignore_text
|
||||
def set_modified_ignore_response():
|
||||
test_return_data = """<html>
|
||||
def set_modified_ignore_response(ver_stamp="123"):
|
||||
test_return_data = f"""<html>
|
||||
<body>
|
||||
Some initial text<br>
|
||||
<p>Which is across multiple lines</p>
|
||||
<P>ZZZZz</P>
|
||||
<br>
|
||||
So let's see what happens. <br>
|
||||
<link href="https://www.somesite/wp-content/themes/cooltheme/style2.css?v={ver_stamp}" rel="stylesheet"/>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@@ -165,9 +168,9 @@ def test_check_ignore_text_functionality(client, live_server, measure_memory_usa
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
# When adding some ignore text, it should not trigger a change, even if something else on that line changes
|
||||
def test_check_global_ignore_text_functionality(client, live_server, measure_memory_usage):
|
||||
#live_server_setup(live_server)
|
||||
ignore_text = "XXXXX\r\nYYYYY\r\nZZZZZ"
|
||||
def _run_test_global_ignore(client, as_source=False, extra_ignore=""):
|
||||
ignore_text = "XXXXX\r\nYYYYY\r\nZZZZZ\r\n"+extra_ignore
|
||||
|
||||
set_original_ignore_response()
|
||||
|
||||
# Goto the settings page, add our ignore text
|
||||
@@ -186,6 +189,10 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_endpoint', _external=True)
|
||||
if as_source:
|
||||
# Switch to source mode so we can test that too!
|
||||
test_url = "source:"+test_url
|
||||
|
||||
res = client.post(
|
||||
url_for("imports.import_page"),
|
||||
data={"urls": test_url},
|
||||
@@ -203,12 +210,15 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"Updated watch." in res.data
|
||||
|
||||
wait_for_all_checks(client)
|
||||
# Check it saved
|
||||
res = client.get(
|
||||
url_for("settings.settings_page"),
|
||||
)
|
||||
assert bytes(ignore_text.encode('utf-8')) in res.data
|
||||
|
||||
for i in ignore_text.splitlines():
|
||||
assert bytes(i.encode('utf-8')) in res.data
|
||||
|
||||
|
||||
# Trigger a check
|
||||
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
|
||||
@@ -221,7 +231,8 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem
|
||||
|
||||
# Make a change which includes the ignore text, it should be ignored and no 'change' triggered
|
||||
# It adds text with "ZZZZzzzz" and "ZZZZ" is in the ignore list
|
||||
set_modified_ignore_response()
|
||||
# And tweaks the ver_stamp which should be picked up by global regex ignore
|
||||
set_modified_ignore_response(ver_stamp=time.time())
|
||||
|
||||
# Trigger a check
|
||||
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
|
||||
@@ -243,3 +254,11 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem
|
||||
|
||||
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
def test_check_global_ignore_text_functionality(client, live_server):
|
||||
#live_server_setup(live_server)
|
||||
_run_test_global_ignore(client, as_source=False)
|
||||
|
||||
def test_check_global_ignore_text_functionality_as_source(client, live_server):
|
||||
#live_server_setup(live_server)
|
||||
_run_test_global_ignore(client, as_source=True, extra_ignore='/\?v=\d/')
|
||||
|
||||
@@ -167,7 +167,10 @@ def test_check_notification(client, live_server, measure_memory_usage):
|
||||
assert ':-)' in notification_submission
|
||||
# Check the attachment was added, and that it is a JPEG from the original PNG
|
||||
notification_submission_object = json.loads(notification_submission)
|
||||
assert notification_submission_object
|
||||
|
||||
# We keep PNG screenshots for now
|
||||
# IF THIS FAILS YOU SHOULD BE TESTING WITH ENV VAR REMOVE_REQUESTS_OLD_SCREENSHOTS=False
|
||||
assert notification_submission_object['attachments'][0]['filename'] == 'last-screenshot.png'
|
||||
assert len(notification_submission_object['attachments'][0]['base64'])
|
||||
assert notification_submission_object['attachments'][0]['mimetype'] == 'image/png'
|
||||
|
||||
80
changedetectionio/tests/test_ui.py
Normal file
80
changedetectionio/tests/test_ui.py
Normal file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from flask import url_for
|
||||
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
|
||||
|
||||
def test_checkbox_open_diff_in_new_tab(client, live_server):
|
||||
|
||||
set_original_response()
|
||||
live_server_setup(live_server)
|
||||
|
||||
# Add our URL to the import page
|
||||
res = client.post(
|
||||
url_for("imports.import_page"),
|
||||
data={"urls": url_for('test_endpoint', _external=True)},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"1 Imported" in res.data
|
||||
wait_for_all_checks(client)
|
||||
|
||||
# Make a change
|
||||
set_modified_response()
|
||||
|
||||
# Test case 1 - checkbox is enabled in settings
|
||||
res = client.post(
|
||||
url_for("settings.settings_page"),
|
||||
data={"application-ui-open_diff_in_new_tab": "1"},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b'Settings updated' in res.data
|
||||
|
||||
# Force recheck
|
||||
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
|
||||
assert b'Queued 1 watch for rechecking.' in res.data
|
||||
|
||||
wait_for_all_checks(client)
|
||||
|
||||
res = client.get(url_for("watchlist.index"))
|
||||
lines = res.data.decode().split("\n")
|
||||
|
||||
# Find link to diff page
|
||||
target_line = None
|
||||
for line in lines:
|
||||
if '/diff' in line:
|
||||
target_line = line.strip()
|
||||
break
|
||||
|
||||
assert target_line != None
|
||||
assert 'target=' in target_line
|
||||
|
||||
# Test case 2 - checkbox is disabled in settings
|
||||
res = client.post(
|
||||
url_for("settings.settings_page"),
|
||||
data={"application-ui-open_diff_in_new_tab": ""},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b'Settings updated' in res.data
|
||||
|
||||
# Force recheck
|
||||
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
|
||||
assert b'Queued 1 watch for rechecking.' in res.data
|
||||
|
||||
wait_for_all_checks(client)
|
||||
|
||||
res = client.get(url_for("watchlist.index"))
|
||||
lines = res.data.decode().split("\n")
|
||||
|
||||
# Find link to diff page
|
||||
target_line = None
|
||||
for line in lines:
|
||||
if '/diff' in line:
|
||||
target_line = line.strip()
|
||||
break
|
||||
|
||||
assert target_line != None
|
||||
assert 'target=' not in target_line
|
||||
|
||||
# Cleanup everything
|
||||
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
@@ -109,7 +109,6 @@ class update_worker(threading.Thread):
|
||||
default_notification_title
|
||||
)
|
||||
|
||||
|
||||
# Would be better if this was some kind of Object where Watch can reference the parent datastore etc
|
||||
v = watch.get(var_name)
|
||||
if v and not watch.get('notification_muted'):
|
||||
@@ -592,6 +591,7 @@ class update_worker(threading.Thread):
|
||||
|
||||
self.current_uuid = None # Done
|
||||
self.q.task_done()
|
||||
update_handler = None
|
||||
logger.debug(f"Watch {uuid} done in {time.time()-fetch_start_time:.2f}s")
|
||||
|
||||
# Give the CPU time to interrupt
|
||||
|
||||
@@ -9,15 +9,20 @@ services:
|
||||
# - ./proxies.json:/datastore/proxies.json
|
||||
|
||||
# environment:
|
||||
# Default listening port, can also be changed with the -p option
|
||||
# Default listening port, can also be changed with the -p option (not to be confused with ports: below)
|
||||
# - PORT=5000
|
||||
#
|
||||
# Log levels are in descending order. (TRACE is the most detailed one)
|
||||
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
|
||||
# - LOGGER_LEVEL=TRACE
|
||||
#
|
||||
# Alternative WebDriver/selenium URL, do not use "'s or 's!
|
||||
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
|
||||
#
|
||||
# Uncomment below and the "sockpuppetbrowser" to use a real Chrome browser (It uses the "playwright" protocol)
|
||||
# - PLAYWRIGHT_DRIVER_URL=ws://browser-sockpuppet-chrome:3000
|
||||
#
|
||||
#
|
||||
# Alternative WebDriver/selenium URL, do not use "'s or 's! (old, deprecated, does not support screenshots very well)
|
||||
# - WEBDRIVER_URL=http://browser-selenium-chrome:4444/wd/hub
|
||||
#
|
||||
# WebDriver proxy settings webdriver_proxyType, webdriver_ftpProxy, webdriver_noProxy,
|
||||
# webdriver_proxyAutoconfigUrl, webdriver_autodetect,
|
||||
@@ -25,9 +30,6 @@ services:
|
||||
#
|
||||
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
|
||||
#
|
||||
# Alternative target "Chrome" Playwright URL, do not use "'s or 's!
|
||||
# "Playwright" is a driver/librarythat allows changedetection to talk to a Chrome or similar browser.
|
||||
# - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
|
||||
#
|
||||
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
|
||||
#
|
||||
@@ -43,7 +45,7 @@ services:
|
||||
# Base URL of your changedetection.io install (Added to the notification alert)
|
||||
# - BASE_URL=https://mysite.com
|
||||
# Respect proxy_pass type settings, `proxy_set_header Host "localhost";` and `proxy_set_header X-Forwarded-Prefix /app;`
|
||||
# More here https://github.com/dgtlmoon/changedetection.io/wiki/Running-changedetection.io-behind-a-reverse-proxy-sub-directory
|
||||
# More here https://github.com/dgtlmoon/changedetection.io/wiki/Running-changedetection.io-behind-a-reverse-proxy
|
||||
# - USE_X_SETTINGS=1
|
||||
#
|
||||
# Hides the `Referer` header so that monitored websites can't see the changedetection.io hostname.
|
||||
@@ -63,6 +65,10 @@ services:
|
||||
#
|
||||
# A valid timezone name to run as (for scheduling watch checking) see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
|
||||
# - TZ=America/Los_Angeles
|
||||
#
|
||||
# Maximum height of screenshots, default is 16000 px, screenshots will be clipped to this if exceeded.
|
||||
# RAM usage will be higher if you increase this.
|
||||
# - SCREENSHOT_MAX_HEIGHT=16000
|
||||
|
||||
# Comment out ports: when using behind a reverse proxy , enable networks: etc.
|
||||
ports:
|
||||
@@ -82,8 +88,8 @@ services:
|
||||
|
||||
# Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages.
|
||||
# RECOMMENDED FOR FETCHING PAGES WITH CHROME, be sure to enable the "PLAYWRIGHT_DRIVER_URL" env variable in the main changedetection container
|
||||
# sockpuppetbrowser:
|
||||
# hostname: sockpuppetbrowser
|
||||
# browser-sockpuppet-chrome:
|
||||
# hostname: browser-sockpuppet-chrome
|
||||
# image: dgtlmoon/sockpuppetbrowser:latest
|
||||
# cap_add:
|
||||
# - SYS_ADMIN
|
||||
@@ -98,14 +104,18 @@ services:
|
||||
# Used for fetching pages via Playwright+Chrome where you need Javascript support.
|
||||
# Note: Works well but is deprecated, does not fetch full page screenshots (doesnt work with Visual Selector)
|
||||
# Does not report status codes (200, 404, 403) and other issues
|
||||
# browser-chrome:
|
||||
# hostname: browser-chrome
|
||||
# browser-selenium-chrome:
|
||||
# hostname: browser-selenium-chrome
|
||||
# image: selenium/standalone-chrome:4
|
||||
# environment:
|
||||
# - VNC_NO_PASSWORD=1
|
||||
# - SCREEN_WIDTH=1920
|
||||
# - SCREEN_HEIGHT=1080
|
||||
# - SCREEN_DEPTH=24
|
||||
# CHROME_OPTIONS: |
|
||||
# --window-size=1280,1024
|
||||
# --headless
|
||||
# --disable-gpu
|
||||
# volumes:
|
||||
# # Workaround to avoid the browser crashing inside a docker container
|
||||
# # See https://github.com/SeleniumHQ/docker-selenium#quick-start
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -5,13 +5,13 @@
|
||||
<meta name="description" content="Manage your changedetection.io watches via API, requires the `x-api-key` header which is found in the settings UI.">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<link href="assets/bootstrap.min.css?v=1701595483622" rel="stylesheet" media="screen">
|
||||
<link href="assets/prism.css?v=1701595483622" rel="stylesheet" />
|
||||
<link href="assets/main.css?v=1701595483622" rel="stylesheet" media="screen, print">
|
||||
<link href="assets/favicon.ico?v=1701595483622" rel="icon" type="image/x-icon">
|
||||
<link href="assets/apple-touch-icon.png?v=1701595483622" rel="apple-touch-icon" sizes="180x180">
|
||||
<link href="assets/favicon-32x32.png?v=1701595483622" rel="icon" type="image/png" sizes="32x32">
|
||||
<link href="assets/favicon-16x16.png?v=1701595483622" rel="icon" type="image/png" sizes="16x16">
|
||||
<link href="assets/bootstrap.min.css?v=1744573753999" rel="stylesheet" media="screen">
|
||||
<link href="assets/prism.css?v=1744573753999" rel="stylesheet" />
|
||||
<link href="assets/main.css?v=1744573753999" rel="stylesheet" media="screen, print">
|
||||
<link href="assets/favicon.ico?v=1744573753999" rel="icon" type="image/x-icon">
|
||||
<link href="assets/apple-touch-icon.png?v=1744573753999" rel="apple-touch-icon" sizes="180x180">
|
||||
<link href="assets/favicon-32x32.png?v=1744573753999" rel="icon" type="image/png" sizes="32x32">
|
||||
<link href="assets/favicon-16x16.png?v=1744573753999" rel="icon" type="image/png" sizes="16x16">
|
||||
</head>
|
||||
|
||||
<body class="container-fluid">
|
||||
@@ -928,6 +928,6 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script src="assets/main.bundle.js?v=1701595483622"></script>
|
||||
<script src="assets/main.bundle.js?v=1744573753999"></script>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
BIN
docs/web-page-change-conditions.png
Normal file
BIN
docs/web-page-change-conditions.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 104 KiB |
@@ -110,3 +110,6 @@ pluggy ~= 1.5
|
||||
|
||||
# Needed for testing, cross-platform for process and system monitoring
|
||||
psutil==7.0.0
|
||||
|
||||
ruff >= 0.11.2
|
||||
pre_commit >= 4.2.0
|
||||
|
||||
Reference in New Issue
Block a user