Skip to content

Commit d69f82a

Browse files
committed
Patch for importing the different PDF-related modules used for processing PDF files
1 parent 65b1cfb commit d69f82a

2 files changed

Lines changed: 15 additions & 15 deletions

File tree

TextSpitter/core.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,30 @@
11
import mimetypes
22
from docx import Document
33
from io import BytesIO
4-
try:
5-
import fitz
6-
except Exception:
7-
import PyPDF2
84

95

106
def PdfFileRead(file):
11-
'''This current code provides a workaround in case MuPDF (a dependency for
7+
"""This current code provides a workaround in case MuPDF (a dependency for
128
PyMuPDF) is not usable in the development environment. For such instances,
139
the module relies on PyPDF2 to extract text data. However, because of the
1410
likelihood of white spaces being rampant in the extracted string data,
15-
those characters get filtered out.'''
11+
those characters get filtered out."""
1612

1713
try:
14+
import fitz
15+
1816
with file.open() as f:
19-
pdf_file = fitz.Document(stream=f.read(), filetype='pdf')
20-
raw_text = [ele.getText('text') for ele in pdf_file]
21-
text = ''.join(raw_text)
17+
pdf_file = fitz.Document(stream=f.read(), filetype="pdf")
18+
raw_text = [ele.getText("text") for ele in pdf_file]
19+
text = "".join(raw_text)
2220
# else:
2321
except Exception:
24-
with open(file, 'rb') as f:
22+
import PyPDF2
23+
24+
with open(file, "rb") as f:
2525
pdf_reader = PyPDF2.PdfFileReader(f)
2626
raw_text = [ele.extractText() for ele in pdf_reader.pages]
27-
text = ''.join(raw_text)
27+
text = "".join(raw_text)
2828
return text
2929

3030

@@ -33,15 +33,15 @@ def DocxFileRead(file):
3333
f_stream = BytesIO(f.read())
3434
document = Document(f_stream)
3535
raw_text = [p.text for p in document.paragraphs]
36-
text = '\n'.join(raw_text)
36+
text = "\n".join(raw_text)
3737
return text
3838

3939

4040
def TextFileRead(file):
41-
return open(file, 'r').read()
41+
return open(file, "r").read()
4242

4343

4444
def get_file_type(file):
4545
mime_type = mimetypes.guess_type(file)[0]
46-
guess_file_type = mime_type.split('/')[1]
46+
guess_file_type = mime_type.split("/")[1]
4747
return guess_file_type

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="TextSpitter",
8-
version="0.3.3",
8+
version="0.3.3_Post_0",
99
author="Francis Secada",
1010
author_email="francis.secada@gmail.com",
1111
description="Python package that spits out text from your document files!",

0 commit comments

Comments
 (0)