Skip to content

Commit e5320e5

Browse files
committed
moved utility functions into wrapper object; requirements updates
1 parent d69f82a commit e5320e5

9 files changed

Lines changed: 159 additions & 131 deletions

File tree

338 Bytes
Binary file not shown.
2.87 KB
Binary file not shown.
1.2 KB
Binary file not shown.

TextSpitter/core.py

Lines changed: 55 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,72 @@
11
import mimetypes
22
from docx import Document
33
from io import BytesIO
4+
from typing import BinaryIO
45

56

6-
def PdfFileRead(file):
7-
"""This current code provides a workaround in case MuPDF (a dependency for
8-
PyMuPDF) is not usable in the development environment. For such instances,
9-
the module relies on PyPDF2 to extract text data. However, because of the
10-
likelihood of white spaces being rampant in the extracted string data,
11-
those characters get filtered out."""
7+
class FileExtractor:
    """Extract the contents of a PDF, DOCX or plain-text file.

    The source may be given either as a filesystem path (``str``) or as an
    already-open file-like object (anything exposing ``read()``).

    Attributes:
        file: The file-like object supplied by the caller, or ``None`` when
            a path string was given.
        name: The path string, or the ``name`` attribute of the file-like
            object (empty string when the object has no ``name``).
    """

    def __init__(self, file: "str | BinaryIO"):
        """Remember where the data lives without reading it yet.

        Args:
            file: A path string or an open file-like object.

        Raises:
            TypeError: If ``file`` is neither a ``str`` nor an object with a
                ``read`` method.
        """
        if isinstance(file, str):
            self.file = None
            self.name = file
        elif hasattr(file, "read"):
            # ``typing.BinaryIO`` is only an annotation helper: real streams
            # (``io.BytesIO``, handles returned by ``open``) are not instances
            # of it, so streams are recognised by behaviour instead.
            self.file = file
            self.name = getattr(file, "name", "")
        else:
            raise TypeError(
                "file must be a path string or a file-like object with "
                f"read(), got {type(file).__name__}"
            )

    @staticmethod
    def get_file_type(file):
        """Guess a short type label for ``file`` from its name.

        Args:
            file: File name or path to inspect.

        Returns:
            The subtype part of the guessed MIME type (``"pdf"`` for
            ``application/pdf``). When the MIME type cannot be guessed, the
            lower-cased file extension without its dot, or ``""`` if the
            name has no extension.
        """
        mime_type = mimetypes.guess_type(file)[0]
        if mime_type is None:
            # guess_type() yields None for unknown extensions; fall back to
            # the extension instead of failing on ``None.split``.
            import os.path

            return os.path.splitext(file)[1].lstrip(".").lower()
        return mime_type.split("/", 1)[1]

    def get_contents(self):
        """Read and return everything in the wrapped file.

        Returns:
            Whatever ``read()`` yields for a file-like object; for a path,
            ``str`` when the name ends in ``.txt`` and ``bytes`` otherwise.
        """
        stream = self.file
        if stream is not None:
            if callable(getattr(stream, "open", None)):
                # Objects that can (re)open themselves keep the original
                # ``with file.open()`` handling.
                with stream.open() as handle:
                    return handle.read()
            if hasattr(stream, "seekable") and stream.seekable():
                # Rewind so repeated calls return the full contents.
                stream.seek(0)
            return stream.read()

        # Read-only modes: the "+" variants need write permission for no gain.
        mode = "r" if self.name.lower().endswith(".txt") else "rb"
        with open(self.name, mode) as handle:
            return handle.read()

    def PdfFileRead(self):
        """Return the text of a PDF, preferring PyMuPDF over PyPDF2.

        PyMuPDF (``fitz``) is tried first. If importing or using it raises
        any exception, extraction falls back to PyPDF2. The text of all
        pages is concatenated as-is; no whitespace filtering is applied.

        Returns:
            The extracted text of every page joined into one string.
        """
        contents = self.get_contents()

        try:
            import fitz

            pdf_file = fitz.Document(stream=contents, filetype="pdf")
            raw_text = [page.getText("text") for page in pdf_file]
        except Exception:
            # Deliberately broad: any PyMuPDF problem triggers the fallback.
            import PyPDF2

            # PdfFileReader needs a seekable stream, not raw bytes.
            pdf_reader = PyPDF2.PdfFileReader(BytesIO(contents))
            raw_text = [page.extractText() for page in pdf_reader.pages]
        return "".join(raw_text)

    def DocxFileRead(self):
        """Return the paragraphs of a DOCX file joined by newlines."""
        document = Document(BytesIO(self.get_contents()))
        return "\n".join(p.text for p in document.paragraphs)

    def TextFileRead(self):
        """Return the contents of a plain-text file as ``str``.

        Bytes read from a binary stream are decoded as UTF-8 so the result
        is text regardless of how the file was supplied.
        """
        contents = self.get_contents()
        if isinstance(contents, bytes):
            return contents.decode("utf-8")
        return contents

TextSpitter/main.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
from .core import *
1+
from .core import FileExtractor
2+
import mimetypes
23

34

45
class WordLoader:
56
def __init__(self, file):
67
self.name = file
7-
self.text = str(file)
8+
self.extractor = FileExtractor(file)
89

910
def file_load(self):
1011
file_loc = self.name
@@ -15,13 +16,13 @@ def file_load(self):
1516
file_types_tup = ("pdf", "docx", "txt", "text")
1617
if file_type in file_types_tup:
1718
if file_type == file_types_tup[0]:
18-
text = PdfFileRead(self.name)
19+
text = self.extractor.PdfFileRead()
1920
elif file_type == file_types_tup[1]:
20-
text = DocxFileRead(self.name)
21+
text = self.extractor.DocxFileRead()
2122
# elif file_type == file_types_tup[2]:
2223
# text = DocFileRead(self.text)
2324
else:
24-
text = TextFileRead(self.text)
25+
text = self.extractor.TextFileRead()
2526
return text
2627
else:
2728
mime_type = mimetypes.guess_type(file_loc)

core_requirements.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
#
2-
# This file is autogenerated by pip-compile
2+
# This file is autogenerated by pip-compile with python 3.9
33
# To update, run:
44
#
55
# pip-compile core_requirements.in
66
#
77
lxml==4.6.3
88
# via python-docx
9-
pymupdf==1.18.12
9+
pymupdf==1.18.19
1010
# via -r core_requirements.in
1111
pypdf2==1.26.0
1212
# via -r core_requirements.in
13-
python-docx==0.8.10
13+
python-docx==0.8.11
1414
# via -r core_requirements.in

0 commit comments

Comments
 (0)