fsecada01
diff --git a/TextSpitter/__pycache__/__init__.cpython-39.pyc b/TextSpitter/__pycache__/__init__.cpython-39.pyc
Binary file changed: 338 Bytes -> 338 Bytes
diff --git a/TextSpitter/__pycache__/core.cpython-39.pyc b/TextSpitter/__pycache__/core.cpython-39.pyc
Binary file changed: 2.87 KB -> 2.87 KB
diff --git a/TextSpitter/__pycache__/main.cpython-39.pyc b/TextSpitter/__pycache__/main.cpython-39.pyc
Binary file changed: 1.2 KB -> 1.2 KB
diff --git a/TextSpitter/core.py b/TextSpitter/core.py
Lines changed: 55 additions & 30 deletions
diff --git a/TextSpitter/main.py b/TextSpitter/main.py
Lines changed: 6 additions & 5 deletions
diff --git a/core_requirements.txt b/core_requirements.txt
Lines changed: 3 additions & 3 deletions
@@ -1,47 +1,72 @@
 import mimetypes
 from docx import Document
 from io import BytesIO
+from typing import BinaryIO
 
 
-def PdfFileRead(file):
-    """This current code provides a workaround in case MuPDF (a dependency for
-    PyMuPDF) is not usable in the development environment. For such instances,
-    the module relies on PyPDF2 to extract text data. However, because of the
-    likelihood of white spaces being rampant in the extracted string data,
-    those characters get filtered out."""
+class FileExtractor:
+    """
+    Wrapper for extracting file contents to string
+    """
 
-    try:
-        import fitz
+    def __init__(self, file: str or BinaryIO):
+        if type(file) == BinaryIO:
+            self.file = file
+            self.name = file.name
+        elif type(file) == str:
+            self.file = None
+            self.name = file
 
-        with file.open() as f:
-            pdf_file = fitz.Document(stream=f.read(), filetype="pdf")
+    @staticmethod
+    def get_file_type(file):
+        mime_type = mimetypes.guess_type(file)[0]
+        guess_file_type = mime_type.split("/")[1]
+        return guess_file_type
+
+    def get_contents(self):
+        write_mode = "rb+"
+        if ".txt" in self.name:
+            write_mode = "r+"
+        if self.file:
+            file = self.file
+            with file.open() as f:
+                contents = f.read()
+        else:
+            with open(self.name, write_mode) as f:
+                contents = f.read()
+        return contents
+
+    def PdfFileRead(self):
+        """This current code provides a workaround in case MuPDF (a dependency for
+        PyMuPDF) is not usable in the development environment. For such instances,
+        the module relies on PyPDF2 to extract text data. However, because of the
+        likelihood of white spaces being rampant in the extracted string data,
+        those characters get filtered out."""
+
+        contents = self.get_contents()
+
+        try:
+            import fitz
+
+            pdf_file = fitz.Document(stream=contents, filetype="pdf")
             raw_text = [ele.getText("text") for ele in pdf_file]
             text = "".join(raw_text)
-    # else:
-    except Exception:
-        import PyPDF2
+        # else:
+        except Exception:
+            import PyPDF2
 
-        with open(file, "rb") as f:
-            pdf_reader = PyPDF2.PdfFileReader(f)
+            pdf_reader = PyPDF2.PdfFileReader(contents)
             raw_text = [ele.extractText() for ele in pdf_reader.pages]
             text = "".join(raw_text)
-    return text
+        return text
 
-
-def DocxFileRead(file):
-    with file.open() as f:
-        f_stream = BytesIO(f.read())
+    def DocxFileRead(self):
+        contents = self.get_contents()
+        f_stream = BytesIO(contents)
         document = Document(f_stream)
         raw_text = [p.text for p in document.paragraphs]
         text = "\n".join(raw_text)
-    return text
-
-
-def TextFileRead(file):
-    return open(file, "r").read()
-
+        return text
 
-def get_file_type(file):
-    mime_type = mimetypes.guess_type(file)[0]
-    guess_file_type = mime_type.split("/")[1]
-    return guess_file_type
+    def TextFileRead(self):
+        return self.get_contents()
@@ -1,10 +1,11 @@
-from .core import *
+from .core import FileExtractor
+import mimetypes
 
 
 class WordLoader:
     def __init__(self, file):
         self.name = file
-        self.text = str(file)
+        self.extractor = FileExtractor(file)
 
     def file_load(self):
         file_loc = self.name
@@ -15,13 +16,13 @@ def file_load(self):
         file_types_tup = ("pdf", "docx", "txt", "text")
         if file_type in file_types_tup:
             if file_type == file_types_tup[0]:
-                text = PdfFileRead(self.name)
+                text = self.extractor.PdfFileRead()
             elif file_type == file_types_tup[1]:
-                text = DocxFileRead(self.name)
+                text = self.extractor.DocxFileRead()
             # elif file_type == file_types_tup[2]:
             #     text = DocFileRead(self.text)
             else:
-                text = TextFileRead(self.text)
+                text = self.extractor.TextFileRead()
             return text
         else:
             mime_type = mimetypes.guess_type(file_loc)
 
@@ -1,14 +1,14 @@
 #
-# This file is autogenerated by pip-compile
+# This file is autogenerated by pip-compile with python 3.9
 # To update, run:
 #
 #    pip-compile core_requirements.in
 #
 lxml==4.6.3
     # via python-docx
-pymupdf==1.18.12
+pymupdf==1.18.19
     # via -r core_requirements.in
 pypdf2==1.26.0
     # via -r core_requirements.in
-python-docx==0.8.10
+python-docx==0.8.11
     # via -r core_requirements.in
Original file line number	Diff line number	Diff line change
`@@ -1,14 +1,14 @@`
`1`	`1`	`#`
`2`		`-# This file is autogenerated by pip-compile`
	`2`	`+# This file is autogenerated by pip-compile with python 3.9`
`3`	`3`	`# To update, run:`
`4`	`4`	`#`
`5`	`5`	`# pip-compile core_requirements.in`
`6`	`6`	`#`
`7`	`7`	`lxml==4.6.3`
`8`	`8`	`# via python-docx`
`9`		`-pymupdf==1.18.12`
	`9`	`+pymupdf==1.18.19`
`10`	`10`	`# via -r core_requirements.in`
`11`	`11`	`pypdf2==1.26.0`
`12`	`12`	`# via -r core_requirements.in`
`13`		`-python-docx==0.8.10`
	`13`	`+python-docx==0.8.11`
`14`	`14`	`# via -r core_requirements.in`