Skip to content

Commit 387ff5a

Browse files
committed
adjustments to ensure imports work
1 parent a770e08 commit 387ff5a

13 files changed

Lines changed: 352 additions & 20 deletions

File tree

.idea/TextSpitter.iml

Lines changed: 3 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

TextSpitter/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
Doc string
33
"""
44

5-
from TextSpitter.main import WordLoader
5+
import os
6+
7+
__version__ = os.environ.get("VERSION", "0.3.7")
8+
9+
from .main import WordLoader
610

711
name = "TextSpitter"
812

TextSpitter/core.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,9 @@ def get_contents(self):
7474
Returns:
7575
str | int | bytes
7676
"""
77-
with self.file as f:
78-
f.seek(0, 0)
77+
mime_type = self.get_file_type(self.file)
78+
open_mode = "r" if "text" in mime_type else "rb+"
79+
with self.file.open(open_mode) as f:
7980
return f.read()
8081

8182
def pdf_file_read(self):
@@ -125,5 +126,5 @@ def text_file_read(self):
125126
Returns:
126127
str
127128
"""
128-
with open(self.file) as f:
129+
with self.file.open() as f:
129130
return f.read()

TextSpitter/main.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
"""
2-
Doc String
2+
The main application to host the `WordLoader` object.
33
"""
44

55
from pathlib import Path
66
from typing import IO
77

8-
from TextSpitter.core import FileExtractor
9-
from TextSpitter.logging import logger
8+
from .core import FileExtractor
9+
from .logger import logger
1010

1111

1212
class WordLoader:
@@ -50,7 +50,8 @@ def file_load(self):
5050
text = self.file.text_file_read()
5151
return text
5252
else:
53-
mime_type = self.file.get_file_type(self.file.name)
53+
mime_type = self.file.get_file_type(self.file.file.name)
54+
5455
logger.error(
5556
f"You are using an incorrect file format for file submissions. "
5657
f"Please upload a .docx/.doc/.txt/.pdf file OR! Note the "

core_requirements.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
loguru
22
PyMuPDF
33
PyPDF2
4-
python-docx
4+
python-docx

pyproject.toml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,12 @@ line_length = 80
5959
#force_grid_wrap = 0
6060

6161
[build-system]
62-
requires = ["setuptools>=61.0"]
62+
requires = ["setuptools>=61.0", "build", "python-docx", "loguru"]
6363
build-backend = "setuptools.build_meta"
6464

6565
[project]
6666
name = "textspitter_fsecada01"
67-
dynamic = ["dependencies", "version"]
67+
dynamic = ["dependencies", "optional-dependencies", "version"]
6868
authors = [
6969
{ name="Francis Secada", email="francis.secada@gmail.com" },
7070
]
@@ -77,12 +77,17 @@ classifiers = [
7777
"Operating System :: OS Independent",
7878
]
7979

80+
[tool.setuptools]
81+
#py-modules = ["TextSpitter"]
82+
packages = ["TextSpitter"]
83+
#package-dir = {"" = "TextSpitter"}
84+
8085
[tool.setuptools.dynamic]
81-
dependencies = {file = ['core_requirements.txt']}
86+
dependencies = {file = ["core_requirements.txt"]}
8287
optional-dependencies = {dev = { file = ["dev_requirements.txt"] }}
8388
version = {attr = "TextSpitter.__version__"}
8489
readme = {file = ['README.md']}
8590

8691
[project.urls]
8792
Homepage = "https://github.com/fsecada01/TextSpitter"
88-
Issues = "https://github.com/fsecada01/TextSpitter/issues"
93+
Issues = "https://github.com/fsecada01/TextSpitter/issues"

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
with open("README.md", "r") as fh:
44
long_description = fh.read()
55

6+
67
setuptools.setup(
78
name="TextSpitter",
89
author="Francis Secada",
@@ -11,7 +12,6 @@
1112
long_description=long_description,
1213
long_description_content_type="text/markdown",
1314
url="https://github.com/fsecada01/TextSpitter",
14-
packages=setuptools.find_packages(),
1515
classifiers=[
1616
"Programming Language :: Python :: 3",
1717
"License :: OSI Approved :: MIT License",
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
Metadata-Version: 2.1
2+
Name: textspitter_fsecada01
3+
Version: 0.3.7
4+
Summary: A text-extraction application that facilitates string consumption.
5+
Home-page: https://github.com/fsecada01/TextSpitter
6+
Author: Francis Secada
7+
Author-email: Francis Secada <francis.secada@gmail.com>
8+
Project-URL: Homepage, https://github.com/fsecada01/TextSpitter
9+
Project-URL: Issues, https://github.com/fsecada01/TextSpitter/issues
10+
Classifier: Programming Language :: Python :: 3
11+
Classifier: License :: OSI Approved :: MIT License
12+
Classifier: Operating System :: OS Independent
13+
Requires-Python: >=3.8
14+
Description-Content-Type: text/markdown
15+
License-File: LICENSE
16+
Requires-Dist: colorama==0.4.6
17+
Requires-Dist: loguru==0.7.2
18+
Requires-Dist: lxml==5.3.0
19+
Requires-Dist: pymupdf==1.24.10
20+
Requires-Dist: pymupdfb==1.24.10
21+
Requires-Dist: pypdf2==3.0.1
22+
Requires-Dist: python-docx==1.1.2
23+
Requires-Dist: typing-extensions==4.12.2
24+
Requires-Dist: win32-setctime==1.1.0
25+
Provides-Extra: dev
26+
Requires-Dist: anyio==4.5.0; extra == "dev"
27+
Requires-Dist: argon2-cffi==23.1.0; extra == "dev"
28+
Requires-Dist: argon2-cffi-bindings==21.2.0; extra == "dev"
29+
Requires-Dist: arrow==1.3.0; extra == "dev"
30+
Requires-Dist: asttokens==2.4.1; extra == "dev"
31+
Requires-Dist: async-lru==2.0.4; extra == "dev"
32+
Requires-Dist: attrs==24.2.0; extra == "dev"
33+
Requires-Dist: babel==2.16.0; extra == "dev"
34+
Requires-Dist: beautifulsoup4==4.12.3; extra == "dev"
35+
Requires-Dist: black==24.8.0; extra == "dev"
36+
Requires-Dist: bleach==6.1.0; extra == "dev"
37+
Requires-Dist: certifi==2024.8.30; extra == "dev"
38+
Requires-Dist: cffi==1.17.1; extra == "dev"
39+
Requires-Dist: charset-normalizer==3.3.2; extra == "dev"
40+
Requires-Dist: click==8.1.7; extra == "dev"
41+
Requires-Dist: colorama==0.4.6; extra == "dev"
42+
Requires-Dist: comm==0.2.2; extra == "dev"
43+
Requires-Dist: debugpy==1.8.5; extra == "dev"
44+
Requires-Dist: decorator==5.1.1; extra == "dev"
45+
Requires-Dist: defusedxml==0.7.1; extra == "dev"
46+
Requires-Dist: docutils==0.21.2; extra == "dev"
47+
Requires-Dist: executing==2.1.0; extra == "dev"
48+
Requires-Dist: fastjsonschema==2.20.0; extra == "dev"
49+
Requires-Dist: fqdn==1.5.1; extra == "dev"
50+
Requires-Dist: h11==0.14.0; extra == "dev"
51+
Requires-Dist: httpcore==1.0.5; extra == "dev"
52+
Requires-Dist: httpx==0.27.2; extra == "dev"
53+
Requires-Dist: idna==3.10; extra == "dev"
54+
Requires-Dist: importlib-metadata==8.5.0; extra == "dev"
55+
Requires-Dist: ipykernel==6.29.5; extra == "dev"
56+
Requires-Dist: ipython==8.27.0; extra == "dev"
57+
Requires-Dist: isoduration==20.11.0; extra == "dev"
58+
Requires-Dist: isort==5.13.2; extra == "dev"
59+
Requires-Dist: jaraco-classes==3.4.0; extra == "dev"
60+
Requires-Dist: jaraco-context==6.0.1; extra == "dev"
61+
Requires-Dist: jaraco-functools==4.0.2; extra == "dev"
62+
Requires-Dist: jedi==0.19.1; extra == "dev"
63+
Requires-Dist: jinja2==3.1.4; extra == "dev"
64+
Requires-Dist: json5==0.9.25; extra == "dev"
65+
Requires-Dist: jsonpointer==3.0.0; extra == "dev"
66+
Requires-Dist: jsonschema[format-nongpl]==4.23.0; extra == "dev"
67+
Requires-Dist: jsonschema-specifications==2023.12.1; extra == "dev"
68+
Requires-Dist: jupyter-client==8.6.3; extra == "dev"
69+
Requires-Dist: jupyter-core==5.7.2; extra == "dev"
70+
Requires-Dist: jupyter-events==0.10.0; extra == "dev"
71+
Requires-Dist: jupyter-lsp==2.2.5; extra == "dev"
72+
Requires-Dist: jupyter-server==2.14.2; extra == "dev"
73+
Requires-Dist: jupyter-server-terminals==0.5.3; extra == "dev"
74+
Requires-Dist: jupyterlab==4.2.5; extra == "dev"
75+
Requires-Dist: jupyterlab-code-formatter==3.0.2; extra == "dev"
76+
Requires-Dist: jupyterlab-pygments==0.3.0; extra == "dev"
77+
Requires-Dist: jupyterlab-server==2.27.3; extra == "dev"
78+
Requires-Dist: keyring==25.4.1; extra == "dev"
79+
Requires-Dist: markdown-it-py==3.0.0; extra == "dev"
80+
Requires-Dist: markupsafe==2.1.5; extra == "dev"
81+
Requires-Dist: matplotlib-inline==0.1.7; extra == "dev"
82+
Requires-Dist: mdurl==0.1.2; extra == "dev"
83+
Requires-Dist: mistune==3.0.2; extra == "dev"
84+
Requires-Dist: more-itertools==10.5.0; extra == "dev"
85+
Requires-Dist: mypy-extensions==1.0.0; extra == "dev"
86+
Requires-Dist: nbclient==0.10.0; extra == "dev"
87+
Requires-Dist: nbconvert==7.16.4; extra == "dev"
88+
Requires-Dist: nbformat==5.10.4; extra == "dev"
89+
Requires-Dist: nest-asyncio==1.6.0; extra == "dev"
90+
Requires-Dist: nh3==0.2.18; extra == "dev"
91+
Requires-Dist: notebook-shim==0.2.4; extra == "dev"
92+
Requires-Dist: overrides==7.7.0; extra == "dev"
93+
Requires-Dist: packaging==24.1; extra == "dev"
94+
Requires-Dist: pandocfilters==1.5.1; extra == "dev"
95+
Requires-Dist: parso==0.8.4; extra == "dev"
96+
Requires-Dist: pathspec==0.12.1; extra == "dev"
97+
Requires-Dist: pkginfo==1.10.0; extra == "dev"
98+
Requires-Dist: platformdirs==4.3.6; extra == "dev"
99+
Requires-Dist: prometheus-client==0.21.0; extra == "dev"
100+
Requires-Dist: prompt-toolkit==3.0.47; extra == "dev"
101+
Requires-Dist: psutil==6.0.0; extra == "dev"
102+
Requires-Dist: pure-eval==0.2.3; extra == "dev"
103+
Requires-Dist: pycparser==2.22; extra == "dev"
104+
Requires-Dist: pygments==2.18.0; extra == "dev"
105+
Requires-Dist: python-dateutil==2.9.0.post0; extra == "dev"
106+
Requires-Dist: python-json-logger==2.0.7; extra == "dev"
107+
Requires-Dist: pywin32==306; extra == "dev"
108+
Requires-Dist: pywin32-ctypes==0.2.3; extra == "dev"
109+
Requires-Dist: pywinpty==2.0.13; extra == "dev"
110+
Requires-Dist: pyyaml==6.0.2; extra == "dev"
111+
Requires-Dist: pyzmq==26.2.0; extra == "dev"
112+
Requires-Dist: readme-renderer==44.0; extra == "dev"
113+
Requires-Dist: referencing==0.35.1; extra == "dev"
114+
Requires-Dist: requests==2.32.3; extra == "dev"
115+
Requires-Dist: requests-toolbelt==1.0.0; extra == "dev"
116+
Requires-Dist: rfc3339-validator==0.1.4; extra == "dev"
117+
Requires-Dist: rfc3986==2.0.0; extra == "dev"
118+
Requires-Dist: rfc3986-validator==0.1.1; extra == "dev"
119+
Requires-Dist: rich==13.8.1; extra == "dev"
120+
Requires-Dist: rpds-py==0.20.0; extra == "dev"
121+
Requires-Dist: ruff==0.6.6; extra == "dev"
122+
Requires-Dist: send2trash==1.8.3; extra == "dev"
123+
Requires-Dist: six==1.16.0; extra == "dev"
124+
Requires-Dist: sniffio==1.3.1; extra == "dev"
125+
Requires-Dist: soupsieve==2.6; extra == "dev"
126+
Requires-Dist: stack-data==0.6.3; extra == "dev"
127+
Requires-Dist: terminado==0.18.1; extra == "dev"
128+
Requires-Dist: tinycss2==1.3.0; extra == "dev"
129+
Requires-Dist: tornado==6.4.1; extra == "dev"
130+
Requires-Dist: traitlets==5.14.3; extra == "dev"
131+
Requires-Dist: twine==5.1.1; extra == "dev"
132+
Requires-Dist: types-python-dateutil==2.9.0.20240906; extra == "dev"
133+
Requires-Dist: uri-template==1.3.0; extra == "dev"
134+
Requires-Dist: urllib3==2.2.3; extra == "dev"
135+
Requires-Dist: wcwidth==0.2.13; extra == "dev"
136+
Requires-Dist: webcolors==24.8.0; extra == "dev"
137+
Requires-Dist: webencodings==0.5.1; extra == "dev"
138+
Requires-Dist: websocket-client==1.8.0; extra == "dev"
139+
Requires-Dist: zipp==3.20.2; extra == "dev"
140+
141+
# THANK YOU FOR USING TEXTSPITTER!! #
142+
143+
I created this little app to help me process documents from folder sets and batches. Instead of trying to determine each file type and process accordingly, I thought it would be more prudent to read file names and then route text extraction functions accordingly. Also, I was having a really difficult time getting textract/pdftotext to work **because of damn Poppler**. So instead of troubleshooting that whole process after 6+ hours, I figured this was more time-efficient.
144+
145+
This is my first python module, so I hope I did this well!
146+
147+
## Installation ##
148+
* Type `pip install TextSpitter`
149+
* **OPTIONAL** type `pip install PyMuPDF` to install the Python-MuPDF engine for better fidelity with text extraction (i.e., maintaining correct whitespace)
150+
* You will need to follow instructions to ensure that PyMuPDF's dependencies install to your system. There are wheels and binaries available for Windows, Linux, and MacOSX, though if you're on something weird like NetBSD/FreeBSD/specialty Linux distros, you may be SOL. Fortunately, CLI options like Yum, Pkgin, Apt-Get and so forth will have packages available straight from the terminal.
151+
* For detailed instructions, please visit here: https://github.com/rk700/PyMuPDF and maybe give those guys some kudos, because they worked their tails off.
152+
153+
## Directions ##
154+
This module is designed to run as simply as possible. Just pass the file location as a string argument, and get your text returned to you.
155+
156+
```
157+
from TextSpitter import TextSpitter as TS
158+
folder_loc = 'foo/bar/'
159+
160+
docx_file = folder_loc + 'file_thing.docx'
161+
pdf_file = folder_loc + 'file_thing.pdf'
162+
text_file = folder_loc + 'file_thing.txt'
163+
164+
doc_tup = (docx_file, pdf_file, text_file)
165+
166+
raw_text_payload = [TS(filename=ele) for ele in doc_tup]
167+
text = '\n'.join(raw_text_payload)
168+
return text
169+
```
170+
171+
## TO DOs ##
172+
* [x] spruce up documentation
173+
* [X] Add stream functionality for s3-based file reading
174+
* [ ] expand functionality to other file types
175+
* [ ] TBD
176+
177+
## WANT TO CONTRIBUTE!? ##
178+
_*OH MY GOD, PLEASE DO.*_
179+
180+
Just make a pull request and add whatever you want (or fix whatever you want). I'll review and approve if everything seems good.
181+
182+
Thanks, everyone!
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
LICENSE
2+
README.md
3+
core_requirements.txt
4+
dev_requirements.txt
5+
pyproject.toml
6+
setup.py
7+
TextSpitter/__init__.py
8+
TextSpitter/core.py
9+
TextSpitter/logger.py
10+
TextSpitter/main.py
11+
textspitter_fsecada01.egg-info/PKG-INFO
12+
textspitter_fsecada01.egg-info/SOURCES.txt
13+
textspitter_fsecada01.egg-info/dependency_links.txt
14+
textspitter_fsecada01.egg-info/requires.txt
15+
textspitter_fsecada01.egg-info/top_level.txt

0 commit comments

Comments
 (0)