Skip to content

Commit 0de893f

Browse files
committed
Small patch to fix DOCX functionality
1 parent 9d4613b commit 0de893f

3 files changed

Lines changed: 6 additions & 5 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,3 +2,4 @@ build/
22
dist/
33
TextSpitter.egg-info/
44
v */
5+
git_push.bat

TextSpitter/core.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,8 @@ def PdfFileRead(file):
1515
those characters get filtered out.'''
1616

1717
try:
18-
with file.open().read() as f:
19-
pdf_file = fitz.Document(stream=f, filetype='pdf')
18+
with file.open() as f:
19+
pdf_file = fitz.Document(stream=f.read(), filetype='pdf')
2020
raw_text = [ele.getText('text') for ele in pdf_file]
2121
text = ''.join(raw_text)
2222
# else:
@@ -29,8 +29,8 @@ def PdfFileRead(file):
2929

3030

3131
def DocxFileRead(file):
32-
with file.open().read() as f:
33-
f_stream = BytesIO(f)
32+
with file.open() as f:
33+
f_stream = BytesIO(f.read())
3434
document = Document(f_stream)
3535
raw_text = [p.text for p in document.paragraphs]
3636
text = '\n'.join(raw_text)

TextSpitter/main.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ def file_load(self):
1717
if file_type == file_types_tup[0]:
1818
text = PdfFileRead(self.name)
1919
elif file_type == file_types_tup[1]:
20-
text = DocxFileRead(self.text)
20+
text = DocxFileRead(self.name)
2121
# elif file_type == file_types_tup[2]:
2222
# text = DocFileRead(self.text)
2323
else:

0 commit comments

Comments
 (0)