11import mimetypes
22from docx import Document
33from io import BytesIO
4- try :
5- import fitz
6- except Exception :
7- import PyPDF2
84
95
def PdfFileRead(file):
    """Extract the text of a PDF, trying PyMuPDF first and PyPDF2 second.

    PyMuPDF (imported as ``fitz``) depends on MuPDF, which may not be usable
    in every development environment. Any exception raised on the PyMuPDF
    path -- including a failed import -- therefore triggers a second attempt
    with PyPDF2. No whitespace clean-up is applied to the extracted text.

    Args:
        file: The PDF to read. The PyMuPDF path calls ``file.open()``, while
            the PyPDF2 path hands ``file`` to the built-in ``open``.
            NOTE(review): these are different expectations of the same
            argument -- confirm what callers actually pass.

    Returns:
        The text of all pages joined into one string with no separator.

    Raises:
        Exception: Whatever the PyPDF2 path raises (including a failed
            import) when the fallback fails as well.
    """
    try:
        # Imported lazily so that a missing or broken MuPDF build only
        # disables this path instead of breaking the whole module.
        import fitz

        with file.open() as f:
            pdf_file = fitz.Document(stream=f.read(), filetype="pdf")
            # NOTE(review): newer PyMuPDF releases name this method
            # ``get_text``; if ``getText`` is unavailable the resulting
            # AttributeError silently routes to the PyPDF2 path -- confirm
            # against the pinned version.
            raw_text = [page.getText("text") for page in pdf_file]
            text = "".join(raw_text)
    except Exception:
        # Deliberately broad: any PyMuPDF failure means "use the fallback".
        import PyPDF2

        with open(file, "rb") as f:
            # NOTE(review): ``PdfFileReader`` / ``extractText`` are the legacy
            # PyPDF2 names -- confirm they exist in the pinned version.
            pdf_reader = PyPDF2.PdfFileReader(f)
            raw_text = [page.extractText() for page in pdf_reader.pages]
            text = "".join(raw_text)
    return text
2929
3030
@@ -33,15 +33,15 @@ def DocxFileRead(file):
3333 f_stream = BytesIO (f .read ())
3434 document = Document (f_stream )
3535 raw_text = [p .text for p in document .paragraphs ]
36- text = ' \n ' .join (raw_text )
36+ text = " \n " .join (raw_text )
3737 return text
3838
3939
def TextFileRead(file):
    """Return the entire contents of a text file as one string.

    Args:
        file: Path of the file to read (anything the built-in ``open``
            accepts).

    Returns:
        The file's full contents, read in text mode with the default
        encoding.
    """
    # The context manager closes the handle as soon as the read finishes,
    # rather than leaving it open until garbage collection.
    with open(file, "r") as f:
        return f.read()
4242
4343
def get_file_type(file):
    """Guess a file's type from its name and return the MIME subtype.

    Args:
        file: File name, path or URL passed to ``mimetypes.guess_type``.

    Returns:
        The part of the guessed MIME type after the slash (for example
        ``"pdf"`` for ``application/pdf``), or ``None`` when no type can be
        guessed from the name.
    """
    mime_type = mimetypes.guess_type(file)[0]
    if mime_type is None:
        # Unknown or missing extension: guess_type gives no type, so there
        # is nothing to split.
        return None
    return mime_type.split("/")[1]
0 commit comments