PDF File change detection - Initial PDF fetcher support with basic text extraction (#1244)

2022-12-19 17:51:41 +01:00
parent e8e176f3bd
commit 13c4121f52
10 changed files with 143 additions and 15 deletions
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -114,6 +114,24 @@ class model(dict):

        return ready_url

+    @property
+    def get_fetch_backend(self):
+        """
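+        Return the fetch backend for this watch: normally the `fetch_backend` key, but PDFs are forced to 'html_requests'.
+        :return: 'html_requests' when `is_pdf` is true, otherwise the value of the `fetch_backend` key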
+        """
+        # Maybe also if is_image etc?
+        # Chrome/Playwright won't render the PDF in the browser, so we just fetch it and use pdf2html to see the text.
+        if self.is_pdf:
+            return 'html_requests'
+
+        return self.get('fetch_backend')
+
+    @property
+    def is_pdf(self):
+        # content_type may not be set yet (it is populated later). NOTE(review): '.pdf' is matched anywhere in the URL, not only as a suffix.
+        return '.pdf' in self.get('url', '').lower() or 'pdf' in self.get('content_type', '').lower()
+
    @property
    def label(self):
        # Used for sorting