Skip to content

Commit f50f3e6

Browse files
committed
implemented fix: wrap all file names in FileIO stream to normalize Python File behavior across file_loc and file IO types; increase version to 0.3.5a3.
1 parent e5320e5 commit f50f3e6

9 files changed

Lines changed: 48 additions & 49 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ v */
55
git_push.bat
66
setup_build_script.bat
77
share/
8+
__pycache__

TextSpitter/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from .main import WordLoader
22

33

4-
name = 'TextSpitter'
4+
name = "TextSpitter"
55

66

7-
def TextSpitter(file_path):
8-
return WordLoader(file_path).file_load()
7+
def TextSpitter(file_obj=None, filename: str or None = None):
8+
return WordLoader(file_obj=file_obj, filename=filename).file_load()
-338 Bytes
Binary file not shown.
-2.87 KB
Binary file not shown.
-1.2 KB
Binary file not shown.

TextSpitter/core.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,41 @@
11
import mimetypes
22
from docx import Document
3-
from io import BytesIO
4-
from typing import BinaryIO
3+
from io import FileIO, BytesIO
4+
from typing import IO
55

66

77
class FileExtractor:
88
"""
99
Wrapper for extracting file contents to string
1010
"""
1111

12-
def __init__(self, file: str or BinaryIO):
13-
if type(file) == BinaryIO:
14-
self.file = file
15-
self.name = file.name
16-
elif type(file) == str:
17-
self.file = None
18-
self.name = file
12+
def __init__(
13+
self,
14+
file_obj=None,
15+
filename: str or None = None,
16+
):
17+
"""
18+
The extractor wrapper will initialize by assigning the filename to the object's file property; if a file-like object is provided instead of a name, then it must have a name attribute that includes a file extension.
19+
"""
20+
if filename:
21+
self.file = FileIO(filename)
22+
self.file_ext = filename.split(".")[-1]
23+
else:
24+
if hasattr(file_obj, "name"):
25+
self.file = file_obj.name
26+
self.file_ext = file_obj.name.split(".")[-1]
27+
else:
28+
raise Exception(
29+
"Your file object does not contain a name attribute. Please add a name attribute with a file extension, and try again. Need the file ext. data for mime-typing."
30+
)
1931

2032
@staticmethod
2133
def get_file_type(file):
2234
mime_type = mimetypes.guess_type(file)[0]
23-
guess_file_type = mime_type.split("/")[1]
24-
return guess_file_type
35+
return mime_type.split("/")[1]
2536

2637
def get_contents(self):
27-
write_mode = "rb+"
28-
if ".txt" in self.name:
29-
write_mode = "r+"
30-
if self.file:
31-
file = self.file
32-
with file.open() as f:
33-
contents = f.read()
34-
else:
35-
with open(self.name, write_mode) as f:
36-
contents = f.read()
37-
return contents
38+
return self.file.read()
3839

3940
def PdfFileRead(self):
4041
"""This current code provides a workaround in case MuPDF (a dependency for

TextSpitter/main.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,30 @@
11
from .core import FileExtractor
2+
from typing import IO
23
import mimetypes
34

45

56
class WordLoader:
6-
def __init__(self, file):
7-
self.name = file
8-
self.extractor = FileExtractor(file)
7+
def __init__(self, file_obj=None, filename: str or None = None):
8+
self.file = FileExtractor(file_obj, filename)
99

1010
def file_load(self):
11-
file_loc = self.name
12-
file_type = file_loc.split(".")[-1]
11+
file_type = self.file.file_ext
1312
# file_type = file_loc.split('.')[-1]
1413

1514
# file_types_tup = ('pdf', 'docx', 'doc', 'txt', 'text')
1615
file_types_tup = ("pdf", "docx", "txt", "text")
1716
if file_type in file_types_tup:
1817
if file_type == file_types_tup[0]:
19-
text = self.extractor.PdfFileRead()
18+
text = self.file.PdfFileRead()
2019
elif file_type == file_types_tup[1]:
21-
text = self.extractor.DocxFileRead()
20+
text = self.file.DocxFileRead()
2221
# elif file_type == file_types_tup[2]:
2322
# text = DocFileRead(self.text)
2423
else:
25-
text = self.extractor.TextFileRead()
24+
text = self.file.TextFileRead()
2625
return text
2726
else:
28-
mime_type = mimetypes.guess_type(file_loc)
27+
mime_type = self.file.get_file_type(self.file.name)
2928
print(
3029
f"You are using an incorrect file format for file submissions.\n\
3130
Please upload a .docx/.doc/.txt/.pdf file OR!\n\

dev_requirements.txt

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,21 +22,21 @@ bleach==4.1.0
2222
# via
2323
# nbconvert
2424
# readme-renderer
25-
certifi==2021.5.30
25+
certifi==2021.10.8
2626
# via requests
2727
cffi==1.14.6
2828
# via argon2-cffi
2929
charset-normalizer==2.0.6
3030
# via requests
31-
click==8.0.1
31+
click==8.0.2
3232
# via black
3333
colorama==0.4.4
3434
# via
3535
# click
3636
# ipython
3737
# tqdm
3838
# twine
39-
debugpy==1.4.3
39+
debugpy==1.5.0
4040
# via ipykernel
4141
decorator==5.1.0
4242
# via ipython
@@ -72,7 +72,7 @@ ipython-genutils==0.2.0
7272
# notebook
7373
jedi==0.18.0
7474
# via ipython
75-
jinja2==3.0.1
75+
jinja2==3.0.2
7676
# via
7777
# jupyter-server
7878
# jupyterlab
@@ -81,11 +81,11 @@ jinja2==3.0.1
8181
# notebook
8282
json5==0.9.6
8383
# via jupyterlab-server
84-
jsonschema==3.2.0
84+
jsonschema==4.0.1
8585
# via
8686
# jupyterlab-server
8787
# nbformat
88-
jupyter-client==7.0.3
88+
jupyter-client==7.0.6
8989
# via
9090
# ipykernel
9191
# jupyter-server
@@ -99,12 +99,12 @@ jupyter-core==4.8.1
9999
# nbconvert
100100
# nbformat
101101
# notebook
102-
jupyter-server==1.11.0
102+
jupyter-server==1.11.1
103103
# via
104104
# jupyterlab
105105
# jupyterlab-server
106106
# nbclassic
107-
jupyterlab==3.1.13
107+
jupyterlab==3.1.18
108108
# via -r dev_requirements.in
109109
jupyterlab-pygments==0.1.2
110110
# via nbconvert
@@ -181,7 +181,7 @@ pyrsistent==0.18.0
181181
# via jsonschema
182182
python-dateutil==2.8.2
183183
# via jupyter-client
184-
pytz==2021.1
184+
pytz==2021.3
185185
# via babel
186186
pywin32==301
187187
# via jupyter-core
@@ -194,9 +194,9 @@ pyzmq==22.3.0
194194
# jupyter-client
195195
# jupyter-server
196196
# notebook
197-
readme-renderer==29.0
197+
readme-renderer==30.0
198198
# via twine
199-
regex==2021.9.24
199+
regex==2021.10.8
200200
# via black
201201
requests==2.26.0
202202
# via
@@ -217,9 +217,7 @@ send2trash==1.8.0
217217
six==1.16.0
218218
# via
219219
# bleach
220-
# jsonschema
221220
# python-dateutil
222-
# readme-renderer
223221
sniffio==1.2.0
224222
# via anyio
225223
terminado==0.12.1
@@ -266,7 +264,7 @@ webencodings==0.5.1
266264
# via bleach
267265
websocket-client==1.2.1
268266
# via jupyter-server
269-
zipp==3.5.0
267+
zipp==3.6.0
270268
# via importlib-metadata
271269

272270
# The following packages are considered to be unsafe in a requirements file:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="TextSpitter",
8-
version="0.3.4",
8+
version="0.3.5a3",
99
author="Francis Secada",
1010
author_email="francis.secada@gmail.com",
1111
description="Python package that spits out text from your document files!",

0 commit comments

Comments
 (0)