Skip to content

Commit 9cacdde

Browse files
committed
patching TextSpitter.core to explicitly convert filename contents into File objects
1 parent fb64a60 commit 9cacdde

9 files changed

Lines changed: 2304 additions & 80 deletions

File tree

.idea/ryecharm-overrides.xml

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/ryecharm.xml

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.pre-commit-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
repos:
22
- repo: https://github.com/pre-commit/pre-commit-hooks
3-
rev: v4.6.0
3+
rev: v5.0.0
44
hooks:
55
- id: check-added-large-files
66
exclude: bin/
@@ -34,7 +34,7 @@ repos:
3434

3535
- repo: https://github.com/charliermarsh/ruff-pre-commit
3636
# Ruff version.
37-
rev: "v0.6.5"
37+
rev: "v0.8.0"
3838
hooks:
3939
- id: ruff
4040

@@ -45,7 +45,7 @@ repos:
4545
name: isort (python)
4646

4747
- repo: https://github.com/psf/black
48-
rev: 24.8.0
48+
rev: 24.10.0
4949
hooks:
5050
- id: black
5151
exclude: tests/

TextSpitter/core.py

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44

55
import mimetypes
6-
from io import BytesIO, FileIO
6+
from io import BytesIO
77
from pathlib import Path
88
from typing import IO
99

@@ -25,31 +25,36 @@ def __init__(
2525
object's file property; if a file-like object is provided instead of a
2626
name, then a file_ext arg will be required.
2727
28-
`file_name` is depreciated.
28+
`filename` is deprecated.
2929
3030
Args:
3131
file_obj: str | Path | None
3232
filename: str | None
3333
"""
3434

3535
if filename:
36-
self.file = FileIO(filename)
36+
self.file = Path(filename)
3737
self.file_ext = filename.split(".")[-1]
38-
elif isinstance(file_obj, str):
39-
file_obj = Path(file_obj)
40-
41-
elif file_obj and any((isinstance(file_obj, x) for x in (Path, IO))):
42-
pass
43-
44-
if hasattr(file_obj, "name"):
45-
self.file = file_obj
46-
self.file_ext = file_obj.name.split(".")[-1]
38+
self.file_name = self.file.name
4739
else:
48-
raise Exception(
49-
"Your file object does not contain a name attribute. Please"
50-
" add a name attribute with a file extension, and try "
51-
"again. Need the file ext. data for mime-typing."
52-
)
40+
if isinstance(file_obj, str):
41+
file_obj = Path(file_obj)
42+
43+
elif file_obj and any(
44+
(isinstance(file_obj, x) for x in (Path, IO))
45+
):
46+
pass
47+
48+
if hasattr(file_obj, "name"):
49+
self.file = file_obj
50+
self.file_ext = file_obj.name.split(".")[-1]
51+
self.file_name = file_obj.name
52+
else:
53+
raise Exception(
54+
"Your file object does not contain a name attribute. Please"
55+
" add a name attribute with a file extension, and try "
56+
"again. Need the file ext. data for mime-typing."
57+
)
5358

5459
@staticmethod
5560
def get_file_type(file: str | Path):
@@ -74,7 +79,11 @@ def get_contents(self):
7479
Returns:
7580
str | int | bytes
7681
"""
77-
mime_type = self.get_file_type(self.file)
82+
mime_type = (
83+
self.get_file_type(self.file)
84+
if isinstance(self.file, str)
85+
else self.get_file_type(self.file_name)
86+
)
7887
open_mode = "r" if "text" in mime_type else "rb+"
7988
with self.file.open(open_mode) as f:
8089
return f.read()

core_requirements.txt

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,13 @@
1-
#
2-
# This file is autogenerated by pip-compile with Python 3.12
3-
# by the following command:
4-
#
5-
# pip-compile core_requirements.in
6-
#
1+
# This file was autogenerated by uv via the following command:
2+
# uv pip compile core_requirements.in -o core_requirements.txt
73
colorama==0.4.6
84
# via loguru
95
loguru==0.7.2
106
# via -r core_requirements.in
117
lxml==5.3.0
128
# via python-docx
13-
pymupdf==1.24.10
9+
pymupdf==1.24.14
1410
# via -r core_requirements.in
15-
pymupdfb==1.24.10
16-
# via pymupdf
1711
pypdf2==3.0.1
1812
# via -r core_requirements.in
1913
python-docx==1.1.2

dev_requirements.txt

Lines changed: 32 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
1-
#
2-
# This file is autogenerated by pip-compile with Python 3.12
3-
# by the following command:
4-
#
5-
# pip-compile dev_requirements.in
6-
#
7-
anyio==4.5.0
1+
# This file was autogenerated by uv via the following command:
2+
# uv pip compile dev_requirements.in -o dev_requirements.txt
3+
anyio==4.6.2.post1
84
# via
95
# httpx
106
# jupyter-server
@@ -26,9 +22,9 @@ babel==2.16.0
2622
# via jupyterlab-server
2723
beautifulsoup4==4.12.3
2824
# via nbconvert
29-
black==24.8.0
25+
black==24.10.0
3026
# via -r dev_requirements.in
31-
bleach==6.1.0
27+
bleach==6.2.0
3228
# via nbconvert
3329
certifi==2024.8.30
3430
# via
@@ -37,7 +33,7 @@ certifi==2024.8.30
3733
# requests
3834
cffi==1.17.1
3935
# via argon2-cffi-bindings
40-
charset-normalizer==3.3.2
36+
charset-normalizer==3.4.0
4137
# via requests
4238
click==8.1.7
4339
# via black
@@ -48,7 +44,7 @@ colorama==0.4.6
4844
# ipython
4945
comm==0.2.2
5046
# via ipykernel
51-
debugpy==1.8.5
47+
debugpy==1.8.9
5248
# via ipykernel
5349
decorator==5.1.1
5450
# via ipython
@@ -64,7 +60,7 @@ fqdn==1.5.1
6460
# via jsonschema
6561
h11==0.14.0
6662
# via httpcore
67-
httpcore==1.0.5
63+
httpcore==1.0.7
6864
# via httpx
6965
httpx==0.27.2
7066
# via jupyterlab
@@ -78,7 +74,7 @@ importlib-metadata==8.5.0
7874
# via twine
7975
ipykernel==6.29.5
8076
# via jupyterlab
81-
ipython==8.27.0
77+
ipython==8.29.0
8278
# via ipykernel
8379
isoduration==20.11.0
8480
# via jsonschema
@@ -88,26 +84,26 @@ jaraco-classes==3.4.0
8884
# via keyring
8985
jaraco-context==6.0.1
9086
# via keyring
91-
jaraco-functools==4.0.2
87+
jaraco-functools==4.1.0
9288
# via keyring
93-
jedi==0.19.1
89+
jedi==0.19.2
9490
# via ipython
9591
jinja2==3.1.4
9692
# via
9793
# jupyter-server
9894
# jupyterlab
9995
# jupyterlab-server
10096
# nbconvert
101-
json5==0.9.25
97+
json5==0.9.28
10298
# via jupyterlab-server
10399
jsonpointer==3.0.0
104100
# via jsonschema
105-
jsonschema[format-nongpl]==4.23.0
101+
jsonschema==4.23.0
106102
# via
107103
# jupyter-events
108104
# jupyterlab-server
109105
# nbformat
110-
jsonschema-specifications==2023.12.1
106+
jsonschema-specifications==2024.10.1
111107
# via jsonschema
112108
jupyter-client==8.6.3
113109
# via
@@ -136,19 +132,19 @@ jupyter-server==2.14.2
136132
# notebook-shim
137133
jupyter-server-terminals==0.5.3
138134
# via jupyter-server
139-
jupyterlab==4.2.5
135+
jupyterlab==4.3.1
140136
# via -r dev_requirements.in
141137
jupyterlab-code-formatter==3.0.2
142138
# via -r dev_requirements.in
143139
jupyterlab-pygments==0.3.0
144140
# via nbconvert
145141
jupyterlab-server==2.27.3
146142
# via jupyterlab
147-
keyring==25.4.1
143+
keyring==25.5.0
148144
# via twine
149145
markdown-it-py==3.0.0
150146
# via rich
151-
markupsafe==2.1.5
147+
markupsafe==3.0.2
152148
# via
153149
# jinja2
154150
# nbconvert
@@ -183,7 +179,7 @@ notebook-shim==0.2.4
183179
# via jupyterlab
184180
overrides==7.7.0
185181
# via jupyter-server
186-
packaging==24.1
182+
packaging==24.2
187183
# via
188184
# black
189185
# ipykernel
@@ -206,9 +202,9 @@ platformdirs==4.3.6
206202
# jupyter-core
207203
prometheus-client==0.21.0
208204
# via jupyter-server
209-
prompt-toolkit==3.0.47
205+
prompt-toolkit==3.0.48
210206
# via ipython
211-
psutil==6.0.0
207+
psutil==6.1.0
212208
# via ipykernel
213209
pure-eval==0.2.3
214210
# via stack-data
@@ -226,11 +222,11 @@ python-dateutil==2.9.0.post0
226222
# jupyter-client
227223
python-json-logger==2.0.7
228224
# via jupyter-events
229-
pywin32==306
225+
pywin32==308
230226
# via jupyter-core
231227
pywin32-ctypes==0.2.3
232228
# via keyring
233-
pywinpty==2.0.13
229+
pywinpty==2.0.14
234230
# via
235231
# jupyter-server
236232
# jupyter-server-terminals
@@ -266,20 +262,21 @@ rfc3986-validator==0.1.1
266262
# via
267263
# jsonschema
268264
# jupyter-events
269-
rich==13.8.1
265+
rich==13.9.4
270266
# via twine
271-
rpds-py==0.20.0
267+
rpds-py==0.21.0
272268
# via
273269
# jsonschema
274270
# referencing
275-
ruff==0.6.6
271+
ruff==0.8.0
276272
# via -r dev_requirements.in
277273
send2trash==1.8.3
278274
# via jupyter-server
275+
setuptools==75.6.0
276+
# via jupyterlab
279277
six==1.16.0
280278
# via
281279
# asttokens
282-
# bleach
283280
# python-dateutil
284281
# rfc3339-validator
285282
sniffio==1.3.1
@@ -294,9 +291,9 @@ terminado==0.18.1
294291
# via
295292
# jupyter-server
296293
# jupyter-server-terminals
297-
tinycss2==1.3.0
294+
tinycss2==1.4.0
298295
# via nbconvert
299-
tornado==6.4.1
296+
tornado==6.4.2
300297
# via
301298
# ipykernel
302299
# jupyter-client
@@ -319,7 +316,7 @@ traitlets==5.14.3
319316
# nbformat
320317
twine==5.1.1
321318
# via -r dev_requirements.in
322-
types-python-dateutil==2.9.0.20240906
319+
types-python-dateutil==2.9.0.20241003
323320
# via arrow
324321
uri-template==1.3.0
325322
# via jsonschema
@@ -329,16 +326,13 @@ urllib3==2.2.3
329326
# twine
330327
wcwidth==0.2.13
331328
# via prompt-toolkit
332-
webcolors==24.8.0
329+
webcolors==24.11.1
333330
# via jsonschema
334331
webencodings==0.5.1
335332
# via
336333
# bleach
337334
# tinycss2
338335
websocket-client==1.8.0
339336
# via jupyter-server
340-
zipp==3.20.2
337+
zipp==3.21.0
341338
# via importlib-metadata
342-
343-
# The following packages are considered to be unsafe in a requirements file:
344-
# setuptools

0 commit comments

Comments
 (0)