|
1 | 1 | import mimetypes |
2 | 2 | from docx import Document |
3 | 3 | from io import BytesIO |
| 4 | +from typing import BinaryIO |
4 | 5 |
|
5 | 6 |
|
6 | | -def PdfFileRead(file): |
7 | | - """This current code provides a workaround in case MuPDF (a dependency for |
8 | | - PyMuPDF) is not usable in the development environment. For such instances, |
9 | | - the module relies on PyPDF2 to extract text data. However, because of the |
10 | | - likelihood of white spaces being rampant in the extracted string data, |
11 | | - those characters get filtered out.""" |
| 7 | +class FileExtractor: |
| 8 | + """ |
| 9 | + Wrapper for extracting file contents to string |
| 10 | + """ |
12 | 11 |
|
13 | | - try: |
14 | | - import fitz |
| 12 | + def __init__(self, file: str or BinaryIO): |
| 13 | + if type(file) == BinaryIO: |
| 14 | + self.file = file |
| 15 | + self.name = file.name |
| 16 | + elif type(file) == str: |
| 17 | + self.file = None |
| 18 | + self.name = file |
15 | 19 |
|
16 | | - with file.open() as f: |
17 | | - pdf_file = fitz.Document(stream=f.read(), filetype="pdf") |
| 20 | + @staticmethod |
| 21 | + def get_file_type(file): |
| 22 | + mime_type = mimetypes.guess_type(file)[0] |
| 23 | + guess_file_type = mime_type.split("/")[1] |
| 24 | + return guess_file_type |
| 25 | + |
| 26 | + def get_contents(self): |
| 27 | + write_mode = "rb+" |
| 28 | + if ".txt" in self.name: |
| 29 | + write_mode = "r+" |
| 30 | + if self.file: |
| 31 | + file = self.file |
| 32 | + with file.open() as f: |
| 33 | + contents = f.read() |
| 34 | + else: |
| 35 | + with open(self.name, write_mode) as f: |
| 36 | + contents = f.read() |
| 37 | + return contents |
| 38 | + |
| 39 | + def PdfFileRead(self): |
| 40 | + """This current code provides a workaround in case MuPDF (a dependency for |
| 41 | + PyMuPDF) is not usable in the development environment. For such instances, |
| 42 | + the module relies on PyPDF2 to extract text data. However, because of the |
| 43 | + likelihood of white spaces being rampant in the extracted string data, |
| 44 | + those characters get filtered out.""" |
| 45 | + |
| 46 | + contents = self.get_contents() |
| 47 | + |
| 48 | + try: |
| 49 | + import fitz |
| 50 | + |
| 51 | + pdf_file = fitz.Document(stream=contents, filetype="pdf") |
18 | 52 | raw_text = [ele.getText("text") for ele in pdf_file] |
19 | 53 | text = "".join(raw_text) |
20 | | - # else: |
21 | | - except Exception: |
22 | | - import PyPDF2 |
| 54 | + # else: |
| 55 | + except Exception: |
| 56 | + import PyPDF2 |
23 | 57 |
|
24 | | - with open(file, "rb") as f: |
25 | | - pdf_reader = PyPDF2.PdfFileReader(f) |
| 58 | + pdf_reader = PyPDF2.PdfFileReader(contents) |
26 | 59 | raw_text = [ele.extractText() for ele in pdf_reader.pages] |
27 | 60 | text = "".join(raw_text) |
28 | | - return text |
| 61 | + return text |
29 | 62 |
|
30 | | - |
31 | | -def DocxFileRead(file): |
32 | | - with file.open() as f: |
33 | | - f_stream = BytesIO(f.read()) |
| 63 | + def DocxFileRead(self): |
| 64 | + contents = self.get_contents() |
| 65 | + f_stream = BytesIO(contents) |
34 | 66 | document = Document(f_stream) |
35 | 67 | raw_text = [p.text for p in document.paragraphs] |
36 | 68 | text = "\n".join(raw_text) |
37 | | - return text |
38 | | - |
39 | | - |
40 | | -def TextFileRead(file): |
41 | | - return open(file, "r").read() |
42 | | - |
| 69 | + return text |
43 | 70 |
|
44 | | -def get_file_type(file): |
45 | | - mime_type = mimetypes.guess_type(file)[0] |
46 | | - guess_file_type = mime_type.split("/")[1] |
47 | | - return guess_file_type |
| 71 | + def TextFileRead(self): |
| 72 | + return self.get_contents() |
0 commit comments