Skip to content

Commit d6eb418

Browse files
committed
Adding enhancements for code files. Ready for 0.4.0 release
1 parent b81e9f5 commit d6eb418

10 files changed

Lines changed: 1179 additions & 616 deletions

File tree

.idea/encodings.xml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,12 @@ return text
3131
## TO DOs ##
3232
* [x] spruce up documentation
3333
* [X] Add stream functionality for s3-based file reading
34-
* [ ] expand functionality to other file types
34+
* [x] expand functionality to other file types (e.g., code files, improved CSV handling)
3535
* [ ] TBD
3636

3737
## WANT TO CONTRIBUTE!? ##
3838
_*OH MY GOD, PLEASE DO.*_
3939

4040
Just make a pull request and add whatever you want (or fix whatever you want). I'll review and approve if everything seems good.
4141

42-
Thanks, everyone!
42+
Thanks, everyone!

TextSpitter/core.py

Lines changed: 150 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,26 @@
22
Core application that contains the `FileExtractor` class object
33
"""
44

5-
# It's good practice to have a logger instance if you use it
65
import logging
76
import mimetypes
8-
from io import BytesIO # Ensure BytesIO is imported from io
7+
from io import BytesIO
98
from pathlib import Path
109
from tempfile import SpooledTemporaryFile
1110
from typing import IO
1211

1312
from docx import Document
1413

15-
# from loguru import logger # Assuming logger is configured if used
14+
# --- Module-level imports for optional PDF libraries ---
15+
try:
16+
import pymupdf # type: ignore
17+
except ImportError:
18+
pymupdf = None # Will be None if not installed
1619

20+
try:
21+
import pypdf # type: ignore
22+
except ImportError:
23+
pypdf = None # Will be None if not installed
24+
# --- End of module-level imports ---
1725

1826
logger = logging.getLogger(__name__)
1927

@@ -133,11 +141,109 @@ def get_file_type(
133141
"pdf": "pdf",
134142
"txt": "plain",
135143
"csv": "csv",
144+
# Add programming language mappings
145+
"py": "x-python",
146+
"js": "javascript",
147+
"java": "x-java-source",
148+
"c": "x-c",
149+
"cpp": "x-c++",
150+
"html": "html",
151+
"css": "css",
152+
"json": "json",
153+
"xml": "xml",
136154
}
137155
return ext_to_mime_subtype.get(
138156
ext, "octet-stream"
139157
) # Default to octet-stream
140158

159+
@staticmethod
def is_programming_language_file(file_ext: str) -> bool:
    """
    Check if the file extension corresponds to a programming language file.

    The comparison is case-insensitive. Despite the name, the recognised
    set also contains markup, stylesheet, configuration and documentation
    formats (e.g. html, css, json, yaml, md), because they are all read as
    plain source text.

    Args:
        file_ext: File extension, normally without the dot (e.g. "py").
            Surrounding whitespace and a single leading dot (".py") are
            tolerated so callers passing a raw suffix still get a match.

    Returns:
        bool: True if it's a programming language file. False for unknown
            extensions, the empty string, or a non-string value.
    """
    # A missing extension (None) means "not a code file", not a crash.
    if not isinstance(file_ext, str):
        return False

    programming_extensions = {
        "py",
        "js",
        "ts",
        "java",
        "cpp",
        "c",
        "h",
        "hpp",
        "cs",
        "php",
        "rb",
        "go",
        "rs",
        "swift",
        "kt",
        "scala",
        "r",
        "sql",
        "sh",
        "bash",
        "zsh",
        "ps1",
        "bat",
        "cmd",
        "html",
        "htm",
        "css",
        "scss",
        "sass",
        "less",
        "xml",
        "json",
        "yaml",
        "yml",
        "toml",
        "ini",
        "cfg",
        "conf",
        "md",
        "rst",
        "tex",
        "latex",
        "vue",
        "jsx",
        "tsx",
        "dart",
        "pl",
        "pm",
        "lua",
        "vim",
        "asm",
        "s",
        "f",
        "f90",
        "f95",
        "cob",
        "cobol",
        "pas",
        "pp",
        "ml",
        "fs",
        "fsx",
        "elm",
        "clj",
        "cljs",
        "ex",
        "exs",
        "erl",
        "hrl",
        "jl",
        "nim",
        "cr",
        "zig",
    }

    normalized = file_ext.strip().lower()
    # Accept a raw suffix such as ".py" (e.g. taken from Path.suffix).
    if normalized.startswith("."):
        normalized = normalized[1:]
    return normalized in programming_extensions
246+
141247
def get_contents(self) -> bytes:
142248
"""
143249
Reads the contents from self.file and returns it as bytes.
@@ -194,6 +300,36 @@ def get_contents(self) -> bytes:
194300
f"nor is it a Path or bytes."
195301
)
196302

303+
def code_file_read(self) -> str:
    """
    Reads contents from programming language files (.py, .js, .java, etc.)
    and decodes them to text, preserving the original formatting.

    Encodings are tried from the strictest to the most permissive so that
    each candidate can actually be reached:

    1. ``utf-8-sig`` - decodes plain UTF-8 and also strips a leading
       UTF-8 byte-order mark, so the BOM never leaks into the text.
    2. ``cp1252`` - common legacy Windows encoding; fails only on the few
       byte values it leaves undefined.
    3. ``latin-1`` - maps every byte to a code point, so it never fails
       and must come last.

    Returns:
        str: The file content as a string
    """
    contents_bytes = self.get_contents()

    # Order matters: a permissive codec placed early would shadow every
    # codec listed after it.
    encodings_to_try = ("utf-8-sig", "cp1252", "latin-1")

    for encoding in encodings_to_try:
        try:
            content = contents_bytes.decode(encoding)
        except UnicodeDecodeError:
            continue
        logger.info(
            f"Successfully decoded {self.file_name} using {encoding}"
        )
        return content

    # Defensive fallback: unreachable while latin-1 is in the list above,
    # but keeps the method total if that list is ever edited.
    logger.warning(
        f"Could not decode code file {self.file_name} with standard "
        f"encodings, using utf-8 with replacement characters."
    )
    return contents_bytes.decode("utf-8", errors="replace")
332+
197333
def pdf_file_read(self) -> str: # Added return type hint
198334
"""
199335
This current code provides a workaround in case MuPDF (a dependency
@@ -203,9 +339,13 @@ def pdf_file_read(self) -> str: # Added return type hint
203339
extracted string data, those characters get filtered out.
204340
"""
205341
contents = self.get_contents() # This should now reliably return bytes
342+
text = "" # Default to empty string
206343

207344
try:
208-
import pymupdf # type: ignore
345+
if not pymupdf: # Check if module-level import was successful
346+
raise ImportError(
347+
"pymupdf module not available or import failed."
348+
)
209349

210350
# PyMuPDF's Document constructor can take bytes directly via the
211351
# 'stream' argument
@@ -220,23 +360,26 @@ def pdf_file_read(self) -> str: # Added return type hint
220360
f" {self.file_name}"
221361
)
222362
try:
223-
import pypdf # type: ignore
363+
if not pypdf: # Check if module-level import was successful
364+
raise ImportError(
365+
"pypdf module not available or import failed."
366+
)
224367

225368
# pypdf's PdfReader reads from a stream, so wrap bytes in BytesIO
226369
pdf_stream = BytesIO(contents)
227370
pdf_reader = pypdf.PdfReader(pdf_stream)
228371
raw_text = [
229372
page.extract_text()
230373
for page in pdf_reader.pages
231-
if page.extract_text()
374+
if page.extract_text() # Ensure text is not None or empty
232375
]
233376
text = "".join(raw_text)
234377
except Exception as e_pypdf:
235378
logger.error(
236379
f"Both PyMuPDF and PyPDF2 failed for PDF "
237380
f"{self.file_name}: {e_pypdf}"
238381
)
239-
text = "" # Return empty string on failure
382+
# text remains "" as initialized
240383
return text
241384

242385
def docx_file_read(self) -> str: # Added return type hint

TextSpitter/main.py

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,26 +37,61 @@ def file_load(self):
3737
Returns:
3838
str
3939
"""
40-
file_type = self.file.file_ext
41-
# file_type = file_loc.split('.')[-1]
40+
file_type = self.file.file_ext.lower()
4241

43-
# file_types_tup = ('pdf', 'docx', 'doc', 'txt', 'text')
42+
# Primary file extension mapping
4443
file_ext_matrix = {
4544
"pdf": "pdf_file_read",
4645
"docx": "docx_file_read",
4746
"txt": "text_file_read",
4847
"text": "text_file_read",
4948
"csv": "csv_file_read",
5049
}
50+
51+
# Check if it's a specific supported format first
5152
if file_type in file_ext_matrix:
5253
text = getattr(self.file, file_ext_matrix[file_type])()
5354
return text
55+
# Check if it's a programming language file
56+
elif self.file.is_programming_language_file(file_type):
57+
logger.info(
58+
f"Processing programming language file: {self.file.file_name}"
59+
)
60+
text = self.file.code_file_read()
61+
return text
5462
else:
55-
mime_type = self.file.get_file_type(self.file.file.name)
63+
# Fall back to mime type detection
64+
mime_type = self.file.get_file_type(self.file.file_name)
65+
66+
# Check if mime type suggests it's a text-based file
67+
text_mime_types = [
68+
"plain",
69+
"javascript",
70+
"x-python",
71+
"x-c",
72+
"x-java-source",
73+
"x-c++",
74+
"html",
75+
"css",
76+
"json",
77+
"xml",
78+
]
79+
80+
if mime_type in text_mime_types:
81+
logger.info(
82+
f"Processing text-based file by mime type: {mime_type}"
83+
)
84+
text = (
85+
self.file.code_file_read()
86+
) # Use code_file_read for better encoding handling
87+
return text
5688

5789
logger.error(
5890
f"You are using an incorrect file format for file submissions. "
59-
f"Please upload a .docx/.doc/.txt/.pdf file OR! Note the "
60-
f"mimetype of your submitted data and submit an error report "
61-
f"to github with the following: {mime_type}"
91+
f"Please upload a .docx/.doc/.txt/.pdf file or a supported "
92+
f"programming language file (.py, .js, .java, .cpp, etc.). "
93+
f"Note the mimetype of your submitted data and submit an "
94+
f"error report to github with the following: {mime_type}"
6295
)
96+
97+
return ""

core_requirements.in

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ PyMuPDF
33
pypdf
44
python-docx
55
pytest
6-
pytest-lazy-fixtures
6+
pytest-lazy-fixtures
7+
pytest-mock

core_requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,11 @@ pytest==8.4.0
2424
# via
2525
# -r core_requirements.in
2626
# pytest-lazy-fixtures
27+
# pytest-mock
2728
pytest-lazy-fixtures==1.1.4
2829
# via -r core_requirements.in
30+
pytest-mock==3.14.1
31+
# via -r core_requirements.in
2932
python-docx==1.1.2
3033
# via -r core_requirements.in
3134
typing-extensions==4.14.0

dev_requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ referencing==0.36.2
256256
# jsonschema
257257
# jsonschema-specifications
258258
# jupyter-events
259-
requests==2.32.3
259+
requests==2.32.4
260260
# via
261261
# id
262262
# jupyterlab-server
@@ -280,7 +280,7 @@ rpds-py==0.25.1
280280
# via
281281
# jsonschema
282282
# referencing
283-
ruff==0.11.12
283+
ruff==0.11.13
284284
# via -r dev_requirements.in
285285
send2trash==1.8.3
286286
# via jupyter-server

0 commit comments

Comments
 (0)