Distill.io JSON export file importer (#592)
This commit is contained in:
@@ -683,44 +683,37 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
@app.route("/import", methods=['GET', "POST"])
|
||||
@login_required
|
||||
def import_page():
|
||||
import validators
|
||||
remaining_urls = []
|
||||
|
||||
good = 0
|
||||
|
||||
if request.method == 'POST':
|
||||
now=time.time()
|
||||
urls = request.values.get('urls').split("\n")
|
||||
from .importer import import_url_list, import_distill_io_json
|
||||
|
||||
if (len(urls) > 5000):
|
||||
flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
|
||||
# URL List import
|
||||
if request.values.get('urls') and len(request.values.get('urls').strip()):
|
||||
# Import and push into the queue for immediate update check
|
||||
importer = import_url_list()
|
||||
importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore)
|
||||
for uuid in importer.new_uuids:
|
||||
update_q.put(uuid)
|
||||
|
||||
for url in urls:
|
||||
url = url.strip()
|
||||
url, *tags = url.split(" ")
|
||||
# Flask wtform validators wont work with basic auth, use validators package
|
||||
# Up to 5000 per batch so we dont flood the server
|
||||
if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
|
||||
new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags), write_to_disk_now=False)
|
||||
if new_uuid:
|
||||
# Straight into the queue.
|
||||
update_q.put(new_uuid)
|
||||
good += 1
|
||||
continue
|
||||
if len(importer.remaining_data) == 0:
|
||||
return redirect(url_for('index'))
|
||||
else:
|
||||
remaining_urls = importer.remaining_data
|
||||
|
||||
if len(url.strip()):
|
||||
remaining_urls.append(url)
|
||||
# Distill.io import
|
||||
if request.values.get('distill-io') and len(request.values.get('distill-io').strip()):
|
||||
# Import and push into the queue for immediate update check
|
||||
d_importer = import_distill_io_json()
|
||||
d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
|
||||
for uuid in d_importer.new_uuids:
|
||||
update_q.put(uuid)
|
||||
|
||||
flash("{} Imported in {:.2f}s, {} Skipped.".format(good, time.time()-now,len(remaining_urls)))
|
||||
datastore.needs_write = True
|
||||
|
||||
if len(remaining_urls) == 0:
|
||||
# Looking good, redirect to index.
|
||||
return redirect(url_for('index'))
|
||||
|
||||
# Could be some remaining, or we could be on GET
|
||||
output = render_template("import.html",
|
||||
remaining="\n".join(remaining_urls)
|
||||
import_url_list_remaining="\n".join(remaining_urls),
|
||||
original_distill_json=''
|
||||
)
|
||||
return output
|
||||
|
||||
|
||||
@@ -17,10 +17,10 @@ class perform_site_check():
|
||||
self.datastore = datastore
|
||||
|
||||
# If there was a proxy list enabled, figure out what proxy_args/which proxy to use
|
||||
# if watch.proxy use that
|
||||
# fetcher.proxy_override = watch.proxy or main config proxy
|
||||
# Allows override the proxy on a per-request basis
|
||||
# ALWAYS use the first one is nothing selected
|
||||
# if watch.proxy use that
|
||||
# fetcher.proxy_override = watch.proxy or main config proxy
|
||||
# Allows override the proxy on a per-request basis
|
||||
# ALWAYS use the first one is nothing selected
|
||||
|
||||
def set_proxy_from_list(self, watch):
|
||||
proxy_args = None
|
||||
@@ -149,11 +149,13 @@ class perform_site_check():
|
||||
# Then we assume HTML
|
||||
if has_filter_rule:
|
||||
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
||||
if css_filter_rule[0] == '/':
|
||||
html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
|
||||
if css_filter_rule[0] == '/' or css_filter_rule.startswith('xpath:'):
|
||||
html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule.replace('xpath:', ''),
|
||||
html_content=fetcher.content)
|
||||
else:
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
||||
|
||||
if has_subtractive_selectors:
|
||||
html_content = html_tools.element_removal(subtractive_selectors, html_content)
|
||||
|
||||
@@ -173,7 +175,6 @@ class perform_site_check():
|
||||
# Re #340 - return the content before the 'ignore text' was applied
|
||||
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
||||
|
||||
|
||||
# Re #340 - return the content before the 'ignore text' was applied
|
||||
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
||||
|
||||
@@ -224,4 +225,4 @@ class perform_site_check():
|
||||
if not watch['title'] or not len(watch['title']):
|
||||
update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
|
||||
|
||||
return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot
|
||||
return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot
|
||||
|
||||
133
changedetectionio/importer.py
Normal file
133
changedetectionio/importer.py
Normal file
@@ -0,0 +1,133 @@
|
||||
from abc import ABC, abstractmethod
|
||||
import time
|
||||
import validators
|
||||
|
||||
|
||||
class Importer():
|
||||
remaining_data = []
|
||||
new_uuids = []
|
||||
good = 0
|
||||
|
||||
def __init__(self):
|
||||
self.new_uuids = []
|
||||
self.good = 0
|
||||
self.remaining_data = []
|
||||
|
||||
@abstractmethod
|
||||
def run(self,
|
||||
data,
|
||||
flash,
|
||||
datastore):
|
||||
pass
|
||||
|
||||
|
||||
class import_url_list(Importer):
|
||||
"""
|
||||
Imports a list, can be in <code>https://example.com tag1, tag2, last tag</code> format
|
||||
"""
|
||||
def run(self,
|
||||
data,
|
||||
flash,
|
||||
datastore,
|
||||
):
|
||||
|
||||
urls = data.split("\n")
|
||||
good = 0
|
||||
now = time.time()
|
||||
|
||||
if (len(urls) > 5000):
|
||||
flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
|
||||
|
||||
for url in urls:
|
||||
url = url.strip()
|
||||
if not len(url):
|
||||
continue
|
||||
|
||||
tags = ""
|
||||
|
||||
# 'tags' should be a csv list after the URL
|
||||
if ' ' in url:
|
||||
url, tags = url.split(" ", 1)
|
||||
|
||||
# Flask wtform validators wont work with basic auth, use validators package
|
||||
# Up to 5000 per batch so we dont flood the server
|
||||
if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
|
||||
new_uuid = datastore.add_watch(url=url.strip(), tag=tags, write_to_disk_now=False)
|
||||
if new_uuid:
|
||||
# Straight into the queue.
|
||||
self.new_uuids.append(new_uuid)
|
||||
good += 1
|
||||
continue
|
||||
|
||||
# Worked past the 'continue' above, append it to the bad list
|
||||
if self.remaining_data is None:
|
||||
self.remaining_data = []
|
||||
self.remaining_data.append(url)
|
||||
|
||||
flash("{} Imported from list in {:.2f}s, {} Skipped.".format(good, time.time() - now, len(self.remaining_data)))
|
||||
|
||||
|
||||
class import_distill_io_json(Importer):
|
||||
def run(self,
|
||||
data,
|
||||
flash,
|
||||
datastore,
|
||||
):
|
||||
|
||||
import json
|
||||
good = 0
|
||||
now = time.time()
|
||||
self.new_uuids=[]
|
||||
|
||||
|
||||
try:
|
||||
data = json.loads(data.strip())
|
||||
except json.decoder.JSONDecodeError:
|
||||
flash("Unable to read JSON file, was it broken?", 'error')
|
||||
return
|
||||
|
||||
if not data.get('data'):
|
||||
flash("JSON structure looks invalid, was it broken?", 'error')
|
||||
return
|
||||
|
||||
for d in data.get('data'):
|
||||
d_config = json.loads(d['config'])
|
||||
extras = {'title': d['name']}
|
||||
|
||||
if len(d['uri']) and good < 5000:
|
||||
try:
|
||||
# @todo we only support CSS ones at the moment
|
||||
if d_config['selections'][0]['frames'][0]['excludes'][0]['type'] == 'css':
|
||||
extras['subtractive_selectors'] = d_config['selections'][0]['frames'][0]['excludes'][0]['expr']
|
||||
except KeyError:
|
||||
pass
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
try:
|
||||
extras['css_filter'] = d_config['selections'][0]['frames'][0]['includes'][0]['expr']
|
||||
if d_config['selections'][0]['frames'][0]['includes'][0]['type'] == 'xpath':
|
||||
extras['css_filter'] = 'xpath:' + extras['css_filter']
|
||||
|
||||
except KeyError:
|
||||
pass
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
try:
|
||||
extras['tag'] = ", ".join(d['tags'])
|
||||
except KeyError:
|
||||
pass
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
new_uuid = datastore.add_watch(url=d['uri'].strip(),
|
||||
extras=extras,
|
||||
write_to_disk_now=False)
|
||||
|
||||
if new_uuid:
|
||||
# Straight into the queue.
|
||||
self.new_uuids.append(new_uuid)
|
||||
good += 1
|
||||
|
||||
flash("{} Imported from Distill.io in {:.2f}s, {} Skipped.".format(len(self.new_uuids), time.time() - now, len(self.remaining_data)))
|
||||
@@ -131,7 +131,7 @@ User-Agent: wonderbra 1.0") }}
|
||||
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
||||
<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a
|
||||
href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
|
||||
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example <code>//*[contains(@class, 'sametext')]</code>, <a
|
||||
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
|
||||
href="http://xpather.com/" target="new">test your XPath here</a></li>
|
||||
</ul>
|
||||
Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
|
||||
|
||||
@@ -1,30 +1,86 @@
|
||||
{% extends 'base.html' %}
|
||||
|
||||
{% block content %}
|
||||
<div class="edit-form">
|
||||
<div class="inner">
|
||||
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
|
||||
<div class="edit-form monospaced-textarea">
|
||||
|
||||
<div class="tabs collapsable">
|
||||
<ul>
|
||||
<li class="tab" id="default-tab"><a href="#url-list">URL List</a></li>
|
||||
<li class="tab"><a href="#distill-io">Distill.io</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="box-wrap inner">
|
||||
<form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST">
|
||||
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
|
||||
<fieldset class="pure-group">
|
||||
<legend>
|
||||
Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):
|
||||
<br>
|
||||
<code>https://example.com tag1, tag2, last tag</code>
|
||||
<br>
|
||||
URLs which do not pass validation will stay in the textarea.
|
||||
</legend>
|
||||
|
||||
<div class="tab-pane-inner" id="url-list">
|
||||
<fieldset class="pure-group">
|
||||
<legend>
|
||||
Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma
|
||||
(,):
|
||||
<br>
|
||||
<code>https://example.com tag1, tag2, last tag</code>
|
||||
<br>
|
||||
URLs which do not pass validation will stay in the textarea.
|
||||
</legend>
|
||||
|
||||
<textarea name="urls" class="pure-input-1-2" placeholder="https://"
|
||||
style="width: 100%;
|
||||
|
||||
<textarea name="urls" class="pure-input-1-2" placeholder="https://"
|
||||
style="width: 100%;
|
||||
font-family:monospace;
|
||||
white-space: pre;
|
||||
overflow-wrap: normal;
|
||||
overflow-x: scroll;" rows="25">{{ remaining }}</textarea>
|
||||
</fieldset>
|
||||
overflow-x: scroll;" rows="25">{{ import_url_list_remaining }}</textarea>
|
||||
</fieldset>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
<div class="tab-pane-inner" id="distill-io">
|
||||
|
||||
|
||||
<fieldset class="pure-group">
|
||||
<legend>
|
||||
Copy and Paste your Distill.io watch 'export' file, this should be a JSON file.</br>
|
||||
This is <i>experimental</i>, supported fields are <code>name</code>, <code>uri</code>, <code>tags</code>, <code>config:selections</code>, the rest (including <code>schedule</code>) are ignored.
|
||||
<br/>
|
||||
<p>
|
||||
How to export? <a href="https://distill.io/docs/web-monitor/how-export-and-import-monitors/">https://distill.io/docs/web-monitor/how-export-and-import-monitors/</a><br/>
|
||||
Be sure to set your default fetcher to Chrome if required.</br>
|
||||
</p>
|
||||
</legend>
|
||||
|
||||
|
||||
<textarea name="distill-io" class="pure-input-1-2" style="width: 100%;
|
||||
font-family:monospace;
|
||||
white-space: pre;
|
||||
overflow-wrap: normal;
|
||||
overflow-x: scroll;" placeholder="Example Distill.io JSON export file
|
||||
|
||||
{
|
||||
"client": {
|
||||
"local": 1
|
||||
},
|
||||
"data": [
|
||||
{
|
||||
"name": "Unraid | News",
|
||||
"uri": "https://unraid.net/blog",
|
||||
"config": "{\"selections\":[{\"frames\":[{\"index\":0,\"excludes\":[],\"includes\":[{\"type\":\"xpath\",\"expr\":\"(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\"}]}],\"dynamic\":true,\"delay\":2}],\"ignoreEmptyText\":true,\"includeStyle\":false,\"dataAttr\":\"text\"}",
|
||||
"tags": [],
|
||||
"content_type": 2,
|
||||
"state": 40,
|
||||
"schedule": "{\"type\":\"INTERVAL\",\"params\":{\"interval\":4447}}",
|
||||
"ts": "2022-03-27T15:51:15.667Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
" rows="25">{{ original_distill_json }}</textarea>
|
||||
</fieldset>
|
||||
</div>
|
||||
<button type="submit" class="pure-button pure-input-1-2 pure-button-primary">Import</button>
|
||||
</form>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% endblock %}
|
||||
|
||||
@@ -5,18 +5,17 @@ import time
|
||||
from flask import url_for
|
||||
|
||||
from .util import live_server_setup
|
||||
|
||||
|
||||
def test_import(client, live_server):
|
||||
|
||||
def test_setup(client, live_server):
|
||||
live_server_setup(live_server)
|
||||
|
||||
def test_import(client, live_server):
|
||||
# Give the endpoint time to spin up
|
||||
time.sleep(1)
|
||||
|
||||
res = client.post(
|
||||
url_for("import_page"),
|
||||
data={
|
||||
"distill-io": "",
|
||||
"urls": """https://example.com
|
||||
https://example.com tag1
|
||||
https://example.com tag1, other tag"""
|
||||
@@ -26,3 +25,96 @@ https://example.com tag1, other tag"""
|
||||
assert b"3 Imported" in res.data
|
||||
assert b"tag1" in res.data
|
||||
assert b"other tag" in res.data
|
||||
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
# Clear flask alerts
|
||||
res = client.get( url_for("index"))
|
||||
res = client.get( url_for("index"))
|
||||
|
||||
def xtest_import_skip_url(client, live_server):
|
||||
|
||||
|
||||
# Give the endpoint time to spin up
|
||||
time.sleep(1)
|
||||
|
||||
res = client.post(
|
||||
url_for("import_page"),
|
||||
data={
|
||||
"distill-io": "",
|
||||
"urls": """https://example.com
|
||||
:ht000000broken
|
||||
"""
|
||||
},
|
||||
follow_redirects=True,
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
assert b"ht000000broken" in res.data
|
||||
assert b"1 Skipped" in res.data
|
||||
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
|
||||
# Clear flask alerts
|
||||
res = client.get( url_for("index"))
|
||||
|
||||
def test_import_distillio(client, live_server):
|
||||
|
||||
distill_data='''
|
||||
{
|
||||
"client": {
|
||||
"local": 1
|
||||
},
|
||||
"data": [
|
||||
{
|
||||
"name": "Unraid | News",
|
||||
"uri": "https://unraid.net/blog",
|
||||
"config": "{\\"selections\\":[{\\"frames\\":[{\\"index\\":0,\\"excludes\\":[],\\"includes\\":[{\\"type\\":\\"xpath\\",\\"expr\\":\\"(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\\"}]}],\\"dynamic\\":true,\\"delay\\":2}],\\"ignoreEmptyText\\":true,\\"includeStyle\\":false,\\"dataAttr\\":\\"text\\"}",
|
||||
"tags": ["nice stuff", "nerd-news"],
|
||||
"content_type": 2,
|
||||
"state": 40,
|
||||
"schedule": "{\\"type\\":\\"INTERVAL\\",\\"params\\":{\\"interval\\":4447}}",
|
||||
"ts": "2022-03-27T15:51:15.667Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
'''
|
||||
|
||||
# Give the endpoint time to spin up
|
||||
time.sleep(1)
|
||||
client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
|
||||
res = client.post(
|
||||
url_for("import_page"),
|
||||
data={
|
||||
"distill-io": distill_data,
|
||||
"urls" : ''
|
||||
},
|
||||
follow_redirects=True,
|
||||
)
|
||||
|
||||
|
||||
assert b"Unable to read JSON file, was it broken?" not in res.data
|
||||
assert b"1 Imported from Distill.io" in res.data
|
||||
|
||||
res = client.get( url_for("edit_page", uuid="first"))
|
||||
|
||||
assert b"https://unraid.net/blog" in res.data
|
||||
assert b"Unraid | News" in res.data
|
||||
|
||||
|
||||
# flask/wtforms should recode this, check we see it
|
||||
# wtforms encodes it like id=' ,but html.escape makes it like id='
|
||||
# - so just check it manually :(
|
||||
#import json
|
||||
#import html
|
||||
#d = json.loads(distill_data)
|
||||
# embedded_d=json.loads(d['data'][0]['config'])
|
||||
# x=html.escape(embedded_d['selections'][0]['frames'][0]['includes'][0]['expr']).encode('utf-8')
|
||||
assert b"xpath:(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]" in res.data
|
||||
|
||||
# did the tags work?
|
||||
res = client.get( url_for("index"))
|
||||
|
||||
assert b"nice stuff" in res.data
|
||||
assert b"nerd-news" in res.data
|
||||
|
||||
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
|
||||
# Clear flask alerts
|
||||
res = client.get(url_for("index"))
|
||||
|
||||
@@ -116,4 +116,46 @@ def test_xpath_validation(client, live_server):
|
||||
data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"is not a valid XPath expression" in res.data
|
||||
assert b"is not a valid XPath expression" in res.data
|
||||
|
||||
|
||||
# actually only really used by the distll.io importer, but could be handy too
|
||||
def test_check_with_prefix_css_filter(client, live_server):
|
||||
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
# Give the endpoint time to spin up
|
||||
time.sleep(1)
|
||||
|
||||
set_original_response()
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_endpoint', _external=True)
|
||||
res = client.post(
|
||||
url_for("import_page"),
|
||||
data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
time.sleep(3)
|
||||
|
||||
res = client.post(
|
||||
url_for("edit_page", uuid="first"),
|
||||
data={"css_filter": "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"Updated watch." in res.data
|
||||
time.sleep(3)
|
||||
|
||||
res = client.get(
|
||||
url_for("preview_page", uuid="first"),
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
with open('/tmp/fuck.html', 'wb') as f:
|
||||
f.write(res.data)
|
||||
assert b"Some text thats the same" in res.data #in selector
|
||||
assert b"Some text that will change" not in res.data #not in selector
|
||||
|
||||
client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
Reference in New Issue
Block a user