PDF File change detection - Initial PDF fetcher support with basic text extraction (#1244)

This commit is contained in:
dgtlmoon
2022-12-19 17:51:41 +01:00
committed by GitHub
parent e8e176f3bd
commit 13c4121f52
10 changed files with 143 additions and 15 deletions

View File

@@ -114,6 +114,24 @@ class model(dict):
return ready_url
@property
def get_fetch_backend(self):
    """
    Resolve the fetch backend that should actually be used for this watch.

    Normally this is just the stored `fetch_backend` value, but PDF watches
    are always forced onto the 'html_requests' backend.
    :return: 'html_requests' when `is_pdf` is true, otherwise the `fetch_backend` value (None if unset)
    """
    # Chrome/Playwright won't render a PDF in the browser, so a PDF is fetched
    # directly and its text is extracted from the downloaded file instead.
    # Maybe other content types (is_image etc.) should be handled here as well?
    return 'html_requests' if self.is_pdf else self.get('fetch_backend')
@property
def is_pdf(self):
    """
    Whether this watch appears to point at a PDF document.

    True when the `url` value contains '.pdf' or the `content_type` value
    contains 'pdf'; both comparisons are case-insensitive.
    :return: bool
    """
    # `.get(key, '')` only covers a missing key; a key stored explicitly as
    # None would make `.lower()` raise, so fall back to '' for any falsy value.
    url = (self.get('url') or '').lower()
    # content_type may be absent here - per the original note it is only set later on.
    content_type = (self.get('content_type') or '').lower()
    return '.pdf' in url or 'pdf' in content_type
@property
def label(self):
# Used for sorting