Implemented fix: wrap all file names in a FileIO stream to normalize Python file behavior across file-path and file-object inputs; bump version to 0.3.5a3.

fsecada01 · fsecada01 · commit f50f3e667b40 · 2021-10-09T23:41:02.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ v */
 git_push.bat
 setup_build_script.bat
 share/
+__pycache__
diff --git a/TextSpitter/__init__.py b/TextSpitter/__init__.py
@@ -1,8 +1,8 @@
 from .main import WordLoader
 
 
-name = 'TextSpitter'
+name = "TextSpitter"
 
 
-def TextSpitter(file_path):
-    return WordLoader(file_path).file_load()
+def TextSpitter(file_obj=None, filename: str or None = None):
+    return WordLoader(file_obj=file_obj, filename=filename).file_load()
diff --git a/TextSpitter/__pycache__/__init__.cpython-39.pyc b/TextSpitter/__pycache__/__init__.cpython-39.pyc
diff --git a/TextSpitter/__pycache__/core.cpython-39.pyc b/TextSpitter/__pycache__/core.cpython-39.pyc
diff --git a/TextSpitter/__pycache__/main.cpython-39.pyc b/TextSpitter/__pycache__/main.cpython-39.pyc
diff --git a/TextSpitter/core.py b/TextSpitter/core.py
@@ -1,40 +1,41 @@
 import mimetypes
 from docx import Document
-from io import BytesIO
-from typing import BinaryIO
+from io import FileIO, BytesIO
+from typing import IO
 
 
 class FileExtractor:
     """
     Wrapper for extracting file contents to string
     """
 
-    def __init__(self, file: str or BinaryIO):
-        if type(file) == BinaryIO:
-            self.file = file
-            self.name = file.name
-        elif type(file) == str:
-            self.file = None
-            self.name = file
+    def __init__(
+        self,
+        file_obj=None,
+        filename: str or None = None,
+    ):
+        """
+        The extractor wrapper initializes by opening the given filename as a FileIO stream on the object's file property; if a file-like object is provided instead of a name, it must expose a name attribute that includes a file extension.
+        """
+        if filename:
+            self.file = FileIO(filename)
+            self.file_ext = filename.split(".")[-1]
+        else:
+            if hasattr(file_obj, "name"):
+                self.file = file_obj.name
+                self.file_ext = file_obj.name.split(".")[-1]
+            else:
+                raise Exception(
+                    "Your file object does not contain a name attribute. Please add a name attribute with a file extension, and try again. Need the file ext. data for mime-typing."
+                )
 
     @staticmethod
     def get_file_type(file):
         mime_type = mimetypes.guess_type(file)[0]
-        guess_file_type = mime_type.split("/")[1]
-        return guess_file_type
+        return mime_type.split("/")[1]
 
     def get_contents(self):
-        write_mode = "rb+"
-        if ".txt" in self.name:
-            write_mode = "r+"
-        if self.file:
-            file = self.file
-            with file.open() as f:
-                contents = f.read()
-        else:
-            with open(self.name, write_mode) as f:
-                contents = f.read()
-        return contents
+        return self.file.read()
 
     def PdfFileRead(self):
         """This current code provides a workaround in case MuPDF (a dependency for
diff --git a/TextSpitter/main.py b/TextSpitter/main.py
@@ -1,31 +1,30 @@
 from .core import FileExtractor
+from typing import IO
 import mimetypes
 
 
 class WordLoader:
-    def __init__(self, file):
-        self.name = file
-        self.extractor = FileExtractor(file)
+    def __init__(self, file_obj=None, filename: str or None = None):
+        self.file = FileExtractor(file_obj, filename)
 
     def file_load(self):
-        file_loc = self.name
-        file_type = file_loc.split(".")[-1]
+        file_type = self.file.file_ext
         # file_type = file_loc.split('.')[-1]
 
         # file_types_tup = ('pdf', 'docx', 'doc', 'txt', 'text')
         file_types_tup = ("pdf", "docx", "txt", "text")
         if file_type in file_types_tup:
             if file_type == file_types_tup[0]:
-                text = self.extractor.PdfFileRead()
+                text = self.file.PdfFileRead()
             elif file_type == file_types_tup[1]:
-                text = self.extractor.DocxFileRead()
+                text = self.file.DocxFileRead()
             # elif file_type == file_types_tup[2]:
             #     text = DocFileRead(self.text)
             else:
-                text = self.extractor.TextFileRead()
+                text = self.file.TextFileRead()
             return text
         else:
-            mime_type = mimetypes.guess_type(file_loc)
+            mime_type = self.file.get_file_type(self.file.name)
             print(
                 f"You are using an incorrect file format for file submissions.\n\
             Please upload a .docx/.doc/.txt/.pdf file OR!\n\
diff --git a/dev_requirements.txt b/dev_requirements.txt
@@ -22,21 +22,21 @@ bleach==4.1.0
     # via
     #   nbconvert
     #   readme-renderer
-certifi==2021.5.30
+certifi==2021.10.8
     # via requests
 cffi==1.14.6
     # via argon2-cffi
 charset-normalizer==2.0.6
     # via requests
-click==8.0.1
+click==8.0.2
     # via black
 colorama==0.4.4
     # via
     #   click
     #   ipython
     #   tqdm
     #   twine
-debugpy==1.4.3
+debugpy==1.5.0
     # via ipykernel
 decorator==5.1.0
     # via ipython
@@ -72,7 +72,7 @@ ipython-genutils==0.2.0
     #   notebook
 jedi==0.18.0
     # via ipython
-jinja2==3.0.1
+jinja2==3.0.2
     # via
     #   jupyter-server
     #   jupyterlab
@@ -81,11 +81,11 @@ jinja2==3.0.1
     #   notebook
 json5==0.9.6
     # via jupyterlab-server
-jsonschema==3.2.0
+jsonschema==4.0.1
     # via
     #   jupyterlab-server
     #   nbformat
-jupyter-client==7.0.3
+jupyter-client==7.0.6
     # via
     #   ipykernel
     #   jupyter-server
@@ -99,12 +99,12 @@ jupyter-core==4.8.1
     #   nbconvert
     #   nbformat
     #   notebook
-jupyter-server==1.11.0
+jupyter-server==1.11.1
     # via
     #   jupyterlab
     #   jupyterlab-server
     #   nbclassic
-jupyterlab==3.1.13
+jupyterlab==3.1.18
     # via -r dev_requirements.in
 jupyterlab-pygments==0.1.2
     # via nbconvert
@@ -181,7 +181,7 @@ pyrsistent==0.18.0
     # via jsonschema
 python-dateutil==2.8.2
     # via jupyter-client
-pytz==2021.1
+pytz==2021.3
     # via babel
 pywin32==301
     # via jupyter-core
@@ -194,9 +194,9 @@ pyzmq==22.3.0
     #   jupyter-client
     #   jupyter-server
     #   notebook
-readme-renderer==29.0
+readme-renderer==30.0
     # via twine
-regex==2021.9.24
+regex==2021.10.8
     # via black
 requests==2.26.0
     # via
@@ -217,9 +217,7 @@ send2trash==1.8.0
 six==1.16.0
     # via
     #   bleach
-    #   jsonschema
     #   python-dateutil
-    #   readme-renderer
 sniffio==1.2.0
     # via anyio
 terminado==0.12.1
@@ -266,7 +264,7 @@ webencodings==0.5.1
     # via bleach
 websocket-client==1.2.1
     # via jupyter-server
-zipp==3.5.0
+zipp==3.6.0
     # via importlib-metadata
 
 # The following packages are considered to be unsafe in a requirements file:
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="TextSpitter",
-    version="0.3.4",
+    version="0.3.5a3",
     author="Francis Secada",
     author_email="francis.secada@gmail.com",
     description="Python package that spits out text from your document files!",