22Core application that contains the `FileExtractor` class object
33"""
44
5- # It's good practice to have a logger instance if you use it
65import logging
76import mimetypes
8- from io import BytesIO # Ensure BytesIO is imported from io
7+ from io import BytesIO
98from pathlib import Path
109from tempfile import SpooledTemporaryFile
1110from typing import IO
1211
1312from docx import Document
1413
15- # from loguru import logger # Assuming logger is configured if used
14+ # --- Module-level imports for optional PDF libraries ---
15+ try :
16+ import pymupdf # type: ignore
17+ except ImportError :
18+ pymupdf = None # Will be None if not installed
1619
20+ try :
21+ import pypdf # type: ignore
22+ except ImportError :
23+ pypdf = None # Will be None if not installed
24+ # --- End of module-level imports ---
1725
1826logger = logging .getLogger (__name__ )
1927
@@ -133,11 +141,109 @@ def get_file_type(
133141 "pdf" : "pdf" ,
134142 "txt" : "plain" ,
135143 "csv" : "csv" ,
144+ # Add programming language mappings
145+ "py" : "x-python" ,
146+ "js" : "javascript" ,
147+ "java" : "x-java-source" ,
148+ "c" : "x-c" ,
149+ "cpp" : "x-c++" ,
150+ "html" : "html" ,
151+ "css" : "css" ,
152+ "json" : "json" ,
153+ "xml" : "xml" ,
136154 }
137155 return ext_to_mime_subtype .get (
138156 ext , "octet-stream"
139157 ) # Default to octet-stream
140158
@staticmethod
def is_programming_language_file(file_ext: str) -> bool:
    """
    Tell whether an extension belongs to a source-code or text-markup file.

    The comparison is case-insensitive. Besides programming languages,
    the recognised set also covers shell scripts, web/markup formats,
    configuration formats and documentation formats.

    Args:
        file_ext: File extension (without dot)

    Returns:
        bool: True if the extension is in the recognised set
    """
    recognised = frozenset(
        (
            # General-purpose and systems languages
            "py", "js", "ts", "java", "cpp", "c", "h", "hpp", "cs",
            "php", "rb", "go", "rs", "swift", "kt", "scala", "r", "sql",
            # Shell and batch scripts
            "sh", "bash", "zsh", "ps1", "bat", "cmd",
            # Web, stylesheets and structured data
            "html", "htm", "css", "scss", "sass", "less", "xml", "json",
            # Configuration formats
            "yaml", "yml", "toml", "ini", "cfg", "conf",
            # Documentation and typesetting
            "md", "rst", "tex", "latex",
            # Front-end frameworks and mobile
            "vue", "jsx", "tsx", "dart",
            # Scripting and editor languages
            "pl", "pm", "lua", "vim",
            # Assembly and legacy languages
            "asm", "s", "f", "f90", "f95", "cob", "cobol", "pas", "pp",
            # Functional and newer languages
            "ml", "fs", "fsx", "elm", "clj", "cljs", "ex", "exs",
            "erl", "hrl", "jl", "nim", "cr", "zig",
        )
    )
    normalised = file_ext.lower()
    return normalised in recognised
246+
141247 def get_contents (self ) -> bytes :
142248 """
143249 Reads the contents from self.file and returns it as bytes.
@@ -194,6 +300,36 @@ def get_contents(self) -> bytes:
194300 f"nor is it a Path or bytes."
195301 )
196302
def code_file_read(self) -> str:
    """
    Read a programming language file (.py, .js, .java, etc.) as text.

    The raw bytes come from ``self.get_contents()`` and are decoded with
    the first candidate encoding that accepts them, so the original
    formatting (whitespace, line endings) is preserved.

    Returns:
        str: The file content as a string, without a leading UTF-8
        byte-order mark.
    """
    contents_bytes = self.get_contents()

    # Order matters here:
    # - "utf-8-sig" decodes plain UTF-8 as well and strips a leading BOM,
    #   so it must come first; trying "utf-8" first would keep the BOM as
    #   a stray "\ufeff" character at the start of the text.
    # - "cp1252" rejects the few byte values it leaves undefined, so it
    #   has to be tried before "latin-1", which accepts every byte and
    #   would otherwise turn Windows punctuation into control characters.
    encodings_to_try = ("utf-8-sig", "cp1252", "latin-1")

    for encoding in encodings_to_try:
        try:
            content = contents_bytes.decode(encoding)
        except UnicodeDecodeError:
            continue
        logger.info(
            "Successfully decoded %s using %s", self.file_name, encoding
        )
        return content

    # Defensive only: "latin-1" maps all 256 byte values, so the loop
    # above always returns. Kept so the method still yields text if the
    # candidate list is ever changed.
    logger.warning(
        "Could not decode code file %s with standard "
        "encodings, using utf-8 with replacement characters.",
        self.file_name,
    )
    return contents_bytes.decode("utf-8", errors="replace")
332+
197333 def pdf_file_read (self ) -> str : # Added return type hint
198334 """
199335 This current code provides a workaround in case MuPDF (a dependency
@@ -203,9 +339,13 @@ def pdf_file_read(self) -> str: # Added return type hint
203339 extracted string data, those characters get filtered out.
204340 """
205341 contents = self .get_contents () # This should now reliably return bytes
342+ text = "" # Default to empty string
206343
207344 try :
208- import pymupdf # type: ignore
345+ if not pymupdf : # Check if module-level import was successful
346+ raise ImportError (
347+ "pymupdf module not available or import failed."
348+ )
209349
210350 # PyMuPDF's Document constructor can take bytes directly via the
211351 # 'stream' argument
@@ -220,23 +360,26 @@ def pdf_file_read(self) -> str: # Added return type hint
220360 f" { self .file_name } "
221361 )
222362 try :
223- import pypdf # type: ignore
363+ if not pypdf : # Check if module-level import was successful
364+ raise ImportError (
365+ "pypdf module not available or import failed."
366+ )
224367
225368 # PyPDF2 needs a stream, so wrap bytes in BytesIO
226369 pdf_stream = BytesIO (contents )
227370 pdf_reader = pypdf .PdfReader (pdf_stream )
228371 raw_text = [
229372 page .extract_text ()
230373 for page in pdf_reader .pages
231- if page .extract_text ()
374+ if page .extract_text () # Ensure text is not None or empty
232375 ]
233376 text = "" .join (raw_text )
234377 except Exception as e_pypdf :
235378 logger .error (
236379 f"Both PyMuPDF and PyPDF2 failed for PDF "
237380 f"{ self .file_name } : { e_pypdf } "
238381 )
239- text = "" # Return empty string on failure
382+ # text remains "" as initialized
240383 return text
241384
242385 def docx_file_read (self ) -> str : # Added return type hint
0 commit comments