PDF File change detection - Initial PDF fetcher support with basic text extraction (#1244)

This commit is contained in:
dgtlmoon
2022-12-19 17:51:41 +01:00
committed by GitHub
parent e8e176f3bd
commit 13c4121f52
10 changed files with 143 additions and 15 deletions

View File

@@ -16,6 +16,10 @@ class FilterNotFoundInResponse(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
class PDFToHTMLToolNotFound(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
# Some common stuff here that can be moved to a base class
# (set_proxy_from_list)
@@ -87,7 +91,7 @@ class perform_site_check():
is_source = True
# Pluggable content fetcher
prefer_backend = watch.get('fetch_backend')
prefer_backend = watch.get_fetch_backend
if hasattr(content_fetcher, prefer_backend):
klass = getattr(content_fetcher, prefer_backend)
else:
@@ -117,12 +121,18 @@ class perform_site_check():
if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip():
fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code')
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'))
# requests for PDF's, images etc should be passwd the is_binary flag
is_binary = watch.is_pdf
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary)
fetcher.quit()
self.screenshot = fetcher.screenshot
self.xpath_data = fetcher.xpath_data
# Track the content type
update_obj['content_type'] = fetcher.headers.get('Content-Type', '')
# Watches added automatically in the queue manager will skip if its the same checksum as the previous run
# Saves a lot of CPU
update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest()
@@ -149,6 +159,31 @@ class perform_site_check():
is_html = False
is_json = False
if watch.is_pdf or 'application/pdf' in fetcher.headers.get('Content-Type', '').lower():
from shutil import which
tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
if not which(tool):
raise PDFToHTMLToolNotFound("Command-line `{}` tool was not found in system PATH, was it installed?".format(tool))
import subprocess
proc = subprocess.Popen(
[tool, '-stdout', '-', '-s', 'out.pdf', '-i'],
stdout=subprocess.PIPE,
stdin=subprocess.PIPE)
proc.stdin.write(fetcher.raw_content)
proc.stdin.close()
fetcher.content = proc.stdout.read().decode('utf-8')
proc.wait(timeout=60)
# Add a little metadata so we know if the file changes (like if an image changes, but the text is the same
# @todo may cause problems with non-UTF8?
metadata = "<p>Added by changedetection.io: Document checksum - {} Filesize - {} bytes</p>".format(
hashlib.md5(fetcher.raw_content).hexdigest().upper(),
len(fetcher.content))
fetcher.content = fetcher.content.replace('</body>', metadata + '</body>')
include_filters_rule = deepcopy(watch.get('include_filters', []))
# include_filters_rule = watch['include_filters']
subtractive_selectors = watch.get(