Compare commits

...

12 Commits

Author SHA1 Message Date
dgtlmoon
344f25b412 Add flask_wtf 2022-03-21 22:45:56 +01:00
dgtlmoon
3702973c7f Remove extra files 2022-03-21 22:41:52 +01:00
dgtlmoon
b8286c829a Adding CSRF protection
Disable during tests
2022-03-21 22:40:11 +01:00
dgtlmoon
dc1594b04f Use a safer POST for removepassword, adjust tests 2022-03-21 22:07:05 +01:00
dgtlmoon
f2fa638480 Security update - Protect against file:/// type access by webdriver/chrome. (#483) 2022-03-21 20:59:20 +01:00
dgtlmoon
82d1a7f73e Only build container on GitHub releases, not tests 2022-03-20 16:57:36 +01:00
dgtlmoon
9fc291fb63 Also change container names to help stop some DNS issues 2022-03-17 19:59:37 +01:00
dgtlmoon
3e8a15456a Detect byte-encoding when the server mishandles the content-type header reply (#472) 2022-03-17 10:28:02 +01:00
dgtlmoon
2a03f3f57e Improving form/edit example markup 2022-03-13 12:00:45 +01:00
dgtlmoon
ffad5cca97 JSON diff/preview should use utf-8 encoding where possible (#465) 2022-03-13 11:37:51 +01:00
Tim Loderhose
60a9a786e0 Fix typo in settings form 2022-03-13 10:55:37 +01:00
dgtlmoon
165e950e55 Add python venv to .gitignore 2022-03-13 10:53:33 +01:00
25 changed files with 229 additions and 58 deletions

View File

@@ -2,11 +2,11 @@ name: Build and push containers
on:
# Automatically triggered by a testing workflow passing, but this is only checked when it lands in the `master`/default branch
workflow_run:
workflows: ["ChangeDetection.io Test"]
branches: [master]
tags: ['0.*']
types: [completed]
# workflow_run:
# workflows: ["ChangeDetection.io Test"]
# branches: [master]
# tags: ['0.*']
# types: [completed]
# Or a new tagged release
release:

1
.gitignore vendored
View File

@@ -7,4 +7,5 @@ __pycache__
.pytest_cache
build
dist
venv
.vscode/settings.json

1
changedetectionio/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
test-datastore

View File

@@ -35,6 +35,7 @@ from flask import (
url_for,
)
from flask_login import login_required
from flask_wtf import CSRFProtect
from changedetectionio import html_tools
@@ -72,6 +73,9 @@ app.config['LOGIN_DISABLED'] = False
# Disables caching of the templates
app.config['TEMPLATES_AUTO_RELOAD'] = True
csrf = CSRFProtect()
csrf.init_app(app)
notification_debug_log=[]
def init_app_secret(datastore_path):
@@ -610,16 +614,15 @@ def changedetection_app(config=None, datastore_o=None):
form.notification_format.data = datastore.data['settings']['application']['notification_format']
form.base_url.data = datastore.data['settings']['application']['base_url']
# Password unset is a GET, but we can lock the session to always need the password
if not os.getenv("SALTED_PASS", False) and request.values.get('removepassword') == 'yes':
from pathlib import Path
if request.method == 'POST' and form.data.get('removepassword_button') == True:
# Password unset is a GET, but we can lock the session to a salted env password to always need the password
if not os.getenv("SALTED_PASS", False):
datastore.data['settings']['application']['password'] = False
flash("Password protection removed.", 'notice')
flask_login.logout_user()
return redirect(url_for('settings_page'))
if request.method == 'POST' and form.validate():
datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
datastore.data['settings']['requests']['minutes_between_check'] = form.minutes_between_check.data
datastore.data['settings']['application']['extract_title_as_title'] = form.extract_title_as_title.data

View File

@@ -1,10 +1,12 @@
import os
import time
from abc import ABC, abstractmethod
import chardet
import os
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
from selenium.common.exceptions import WebDriverException
import requests
import time
import urllib3.exceptions
@@ -20,7 +22,7 @@ class EmptyReply(Exception):
class Fetcher():
error = None
status_code = None
content = None # Should always be bytes.
content = None
headers = None
fetcher_description ="No description"
@@ -146,7 +148,6 @@ class html_requests(Fetcher):
fetcher_description = "Basic fast Plaintext/HTTP Client"
def run(self, url, timeout, request_headers, request_body, request_method):
import requests
r = requests.request(method=request_method,
data=request_body,
@@ -155,16 +156,21 @@ class html_requests(Fetcher):
timeout=timeout,
verify=False)
# https://stackoverflow.com/questions/44203397/python-requests-get-returns-improperly-decoded-text-instead-of-utf-8
# Return bytes here
html = r.text
# If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
# For example - some sites don't tell us it's utf-8, but return utf-8 content
# This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
# https://github.com/psf/requests/issues/1604 good info about requests encoding detection
if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):
encoding = chardet.detect(r.content)['encoding']
if encoding:
r.encoding = encoding
# @todo test this
# @todo maybe you really want to test zero-byte return pages?
if not r or not html or not len(html):
if not r or not r.content or not len(r.content):
raise EmptyReply(url=url, status_code=r.status_code)
self.status_code = r.status_code
self.content = html
self.content = r.text
self.headers = r.headers

View File

@@ -1,10 +1,10 @@
import hashlib
import os
import re
import time
import urllib3
from inscriptis import get_text
from inscriptis import get_text
from changedetectionio import content_fetcher, html_tools
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -24,8 +24,14 @@ class perform_site_check():
stripped_text_from_html = ""
watch = self.datastore.data['watching'][uuid]
# Unset any existing notification error
# Protect against file:// access
if re.search(r'^file', watch['url'], re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
raise Exception(
"file:// type access is denied for security reasons."
)
# Unset any existing notification error
update_obj = {'last_notification_error': False, 'last_error': False}
extra_headers = self.datastore.get_val(uuid, 'headers')

View File

@@ -350,6 +350,8 @@ class globalSettingsForm(commonSettingsForm):
[validators.NumberRange(min=1)])
extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
base_url = StringField('Base URL', validators=[validators.Optional()])
global_subtractive_selectors = StringListField('Ignore elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
ignore_whitespace = BooleanField('Ignore whitespace')
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})

View File

@@ -78,7 +78,8 @@ def _parse_json(json_data, jsonpath_filter):
# Re 265 - Just return an empty string when filter not found
return ''
stripped_text_from_html = json.dumps(s, indent=4)
# Ticket #462 - allow the original encoding through, usually it's UTF-8 or similar
stripped_text_from_html = json.dumps(s, indent=4, ensure_ascii=False)
return stripped_text_from_html

View File

@@ -37,6 +37,9 @@ section.content {
align-items: center;
justify-content: center; }
code {
background: #eee; }
/* table related */
.watch-table {
width: 100%;

View File

@@ -42,6 +42,10 @@ section.content {
justify-content: center;
}
code {
background: #eee;
}
/* table related */
.watch-table {
width: 100%;

View File

@@ -19,6 +19,7 @@
<div class="box-wrap inner">
<form class="pure-form pure-form-stacked"
action="{{ url_for('edit_page', uuid=uuid, next = request.args.get('next') ) }}" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<div class="tab-pane-inner" id="general">
<fieldset>
@@ -113,9 +114,9 @@ User-Agent: wonderbra 1.0") }}
<span class="pure-form-message-inline">
<ul>
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <b>"json:"</b>, <a
<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a
href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example <b>//*[contains(@class, 'sametext')]</b>, <a
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example <code>//*[contains(@class, 'sametext')]</code>, <a
href="http://xpather.com/" target="new">test your XPath here</a></li>
</ul>
Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
@@ -142,7 +143,7 @@ nav
<span class="pure-form-message-inline">
<ul>
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
<li>Regular Expression support, wrap the line in forward slash <b>/regex/</b></li>
<li>Regular Expression support, wrap the line in forward slash <code>/regex/</code></li>
<li>Changing this will affect the comparison checksum which may trigger an alert</li>
<li>Use the preview/show current tab to see ignores</li>
</ul>
@@ -159,7 +160,7 @@ nav
<li>Text to wait for before triggering a change/notification, all text and regex are tested <i>case-insensitive</i>.</li>
<li>Trigger text is processed from the result-text that comes out of any CSS/JSON Filters for this watch</li>
<li>Each line is process separately (think of each line as "OR")</li>
<li>Note: Wrap in forward slash / to use regex example: <span style="font-family: monospace; background: #eee">/foo\d/</span></li>
<li>Note: Wrap in forward slash / to use regex example: <code>/foo\d/</code></li>
</ul>
</span>
</div>

View File

@@ -4,6 +4,7 @@
<div class="edit-form">
<div class="inner">
<form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<fieldset class="pure-group">
<legend>
Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):

View File

@@ -4,6 +4,7 @@
<div class="login-form">
<div class="inner">
<form class="pure-form pure-form-stacked" action="{{url_for('login')}}" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<fieldset>
<div class="pure-control-group">
<label for="password">Password</label>

View File

@@ -4,6 +4,7 @@
<div class="edit-form">
<div class="box-wrap inner">
<form class="pure-form pure-form-stacked" action="{{url_for('scrub_page')}}" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<fieldset>
<div class="pure-control-group">
This will remove all version snapshots/data, but keep your list of URLs. <br/>

View File

@@ -1,7 +1,7 @@
{% extends 'base.html' %}
{% block content %}
{% from '_helpers.jinja' import render_field %}
{% from '_helpers.jinja' import render_field, render_button %}
{% from '_common_fields.jinja' import render_common_settings_form %}
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='settings.js')}}" defer></script>
@@ -18,6 +18,7 @@
</div>
<div class="box-wrap inner">
<form class="pure-form pure-form-stacked settings" action="{{url_for('settings_page')}}" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<div class="tab-pane-inner" id="general">
<fieldset>
<div class="pure-control-group">
@@ -27,8 +28,7 @@
<div class="pure-control-group">
{% if not hide_remove_pass %}
{% if current_user.is_authenticated %}
<a href="{{url_for('settings_page', removepassword='yes')}}"
class="pure-button pure-button-primary">Remove password</a>
{{ render_button(form.removepassword_button) }}
{% else %}
{{ render_field(form.password) }}
<span class="pure-form-message-inline">Password protection for your changedetection.io application.</span>
@@ -104,7 +104,7 @@ nav
<ul>
<li>Note: This is applied globally in addition to the per-watch rules.</li>
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
<li>Regular Expression support, wrap the line in forward slash <b>/regex/</b></li>
<li>Regular Expression support, wrap the line in forward slash <code>/regex/</code></li>
<li>Changing this will affect the comparison checksum which may trigger an alert</li>
<li>Use the preview/show current tab to see ignores</li>
</ul>
@@ -114,11 +114,9 @@ nav
<div id="actions">
<div class="pure-control-group">
<button type="submit" class="pure-button pure-button-primary">Save</button>
<a href="{{url_for('index')}}" class="pure-button button-small button-cancel">Back</a>
<a href="{{url_for('scrub_page')}}" class="pure-button button-small button-cancel">Delete
History
Snapshot Data</a>
{{ render_button(form.save_button) }}
<a href="{{url_for('index')}}" class="pure-button button-small button-cancel">Back</a>
<a href="{{url_for('scrub_page')}}" class="pure-button button-small button-cancel">Delete History Snapshot Data</a>
</div>
</div>
</form>

View File

@@ -5,6 +5,7 @@
<div class="box">
<form class="pure-form" action="{{ url_for('api_watch_add') }}" method="POST" id="new-watch-form">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<fieldset>
<legend>Add a new change detection watch</legend>
{{ render_simple_field(form.url, placeholder="https://...", required=true) }}

View File

@@ -42,6 +42,9 @@ def app(request):
cleanup(app_config['datastore_path'])
datastore = store.ChangeDetectionStore(datastore_path=app_config['datastore_path'], include_default_watches=False)
app = changedetection_app(app_config, datastore)
# Disable CSRF while running tests
app.config['WTF_CSRF_ENABLED'] = False
app.config['STOP_THREADS'] = True
def teardown():

View File

@@ -4,8 +4,8 @@ from flask import url_for
def test_check_access_control(app, client):
# Still doesnt work, but this is closer.
with app.test_client() as c:
# Check we dont have any password protection enabled yet.
with app.test_client(use_cookies=True) as c:
# Check we don't have any password protection enabled yet.
res = c.get(url_for("settings_page"))
assert b"Remove password" not in res.data
@@ -46,15 +46,20 @@ def test_check_access_control(app, client):
assert b"BACKUP" in res.data
assert b"IMPORT" in res.data
assert b"LOG OUT" in res.data
assert b"minutes_between_check" in res.data
assert b"fetch_backend" in res.data
# Now remove the password so other tests function, @todo this should happen before each test automatically
res = c.get(url_for("settings_page", removepassword="yes"),
follow_redirects=True)
assert b"Password protection removed." in res.data
res = c.get(url_for("index"))
assert b"LOG OUT" not in res.data
res = c.post(
url_for("settings_page"),
data={
"minutes_between_check": 180,
"tag": "",
"headers": "",
"fetch_backend": "html_webdriver",
"removepassword_button": "Remove password"
},
follow_redirects=True,
)
# There was a bug where saving the settings form would submit a blank password
def test_check_access_control_no_blank_password(app, client):
@@ -71,8 +76,7 @@ def test_check_access_control_no_blank_password(app, client):
data={"password": "",
"minutes_between_check": 180,
'fetch_backend': "html_requests"},
follow_redirects=True
follow_redirects=True
)
assert b"Password protection enabled." not in res.data
@@ -91,7 +95,8 @@ def test_check_access_no_remote_access_to_remove_password(app, client):
# Enable password check.
res = c.post(
url_for("settings_page"),
data={"password": "password", "minutes_between_check": 180,
data={"password": "password",
"minutes_between_check": 180,
'fetch_backend': "html_requests"},
follow_redirects=True
)
@@ -99,8 +104,17 @@ def test_check_access_no_remote_access_to_remove_password(app, client):
assert b"Password protection enabled." in res.data
assert b"Login" in res.data
res = c.get(url_for("settings_page", removepassword="yes"),
follow_redirects=True)
res = c.post(
url_for("settings_page"),
data={
"minutes_between_check": 180,
"tag": "",
"headers": "",
"fetch_backend": "html_webdriver",
"removepassword_button": "Remove password"
},
follow_redirects=True,
)
assert b"Password protection removed." not in res.data
res = c.get(url_for("index"),

View File

@@ -25,6 +25,7 @@ def test_check_basic_change_detection_functionality(client, live_server):
data={"urls": url_for('test_endpoint', _external=True)},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(sleep_time_for_fetch_thread)

View File

@@ -0,0 +1,87 @@
#!/usr/bin/python3
# coding=utf-8
import time
from flask import url_for
from .util import live_server_setup
import pytest
def test_setup(live_server):
live_server_setup(live_server)
def set_html_response():
test_return_data = """
<html><body><span class="nav_second_img_text">
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;铸大国重器,挺制造脊梁,致力能源未来,赋能美好生活。
</span>
</body></html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
return None
# In the case the server does not issue a charset= or doesnt have content_type header set
def test_check_encoding_detection(client, live_server):
set_html_response()
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', content_type="text/html", _external=True)
client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(2)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
# Should see the proper string
assert "铸大国重".encode('utf-8') in res.data
# Should not see the failed encoding
assert b'\xc2\xa7' not in res.data
# In the case the server does not issue a charset= or doesnt have content_type header set
def test_check_encoding_detection_missing_content_type_header(client, live_server):
set_html_response()
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(2)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
# Should see the proper string
assert "铸大国重".encode('utf-8') in res.data
# Should not see the failed encoding
assert b'\xc2\xa7' not in res.data

View File

@@ -1,4 +1,5 @@
#!/usr/bin/python3
# coding=utf-8
import time
from flask import url_for
@@ -142,7 +143,7 @@ def set_modified_response():
}
],
"boss": {
"name": "Foobar"
"name": "Örnsköldsvik"
},
"available": false
}
@@ -246,8 +247,10 @@ def test_check_json_filter(client, live_server):
# Should not see this, because its not in the JSONPath we entered
res = client.get(url_for("diff_history_page", uuid="first"))
# But the change should be there, tho its hard to test the change was detected because it will show old and new versions
assert b'Foobar' in res.data
# And #462 - check we see the proper utf-8 string there
assert "Örnsköldsvik".encode('utf-8') in res.data
def test_check_json_filter_bool_val(client, live_server):

View File

@@ -0,0 +1,36 @@
from flask import url_for
from . util import set_original_response, set_modified_response, live_server_setup
import time
def test_setup(live_server):
live_server_setup(live_server)
def test_file_access(client, live_server):
res = client.post(
url_for("import_page"),
data={"urls": 'https://localhost'},
follow_redirects=True
)
assert b"1 Imported" in res.data
# Attempt to add a body with a GET method
res = client.post(
url_for("edit_page", uuid="first"),
data={
"url": 'file:///etc/passwd',
"tag": "",
"method": "GET",
"fetch_backend": "html_requests",
"body": ""},
follow_redirects=True
)
time.sleep(3)
res = client.get(
url_for("index", uuid="first"),
follow_redirects=True
)
assert b'denied for security reasons' in res.data

View File

@@ -42,7 +42,6 @@ class update_worker(threading.Thread):
now = time.time()
try:
changed_detected, update_obj, contents = update_handler.run(uuid)
# Re #342
@@ -50,8 +49,6 @@ class update_worker(threading.Thread):
# We then convert/.decode('utf-8') for the notification etc
if not isinstance(contents, (bytes, bytearray)):
raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
except PermissionError as e:
self.app.logger.error("File permission error updating", uuid, str(e))
except content_fetcher.EmptyReply as e:

View File

@@ -2,7 +2,7 @@ version: '2'
services:
changedetection:
image: ghcr.io/dgtlmoon/changedetection.io
container_name: changedetection.io
container_name: changedetection
hostname: changedetection
volumes:
- changedetection-data:/datastore

View File

@@ -1,5 +1,5 @@
flask~= 2.0
flask_wtf
eventlet>=0.31.0
validators
timeago ~=1.0