fsecada01
diff --git a/‎.idea/encodings.xml‎
Lines changed: 6 additions & 0 deletions b/‎.idea/encodings.xml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 2 additions & 2 deletions b/‎README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎TextSpitter/core.py‎
Lines changed: 150 additions & 7 deletions b/‎TextSpitter/core.py‎
Lines changed: 150 additions & 7 deletions
diff --git a/‎TextSpitter/main.py‎
Lines changed: 42 additions & 7 deletions b/‎TextSpitter/main.py‎
Lines changed: 42 additions & 7 deletions
diff --git a/‎core_requirements.in‎
Lines changed: 2 additions & 1 deletion b/‎core_requirements.in‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎core_requirements.txt‎
Lines changed: 3 additions & 0 deletions b/‎core_requirements.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎dev_requirements.txt‎
Lines changed: 2 additions & 2 deletions b/‎dev_requirements.txt‎
Lines changed: 2 additions & 2 deletions
@@ -31,12 +31,12 @@ return text
 ## TO DOs ##
 * [x] spruce up documentation
 * [X] Add stream functionality for s3-based file reading
-* [ ] expand functionality to other file types
+* [x] expand functionality to other file types (e.g., code files, improved CSV handling)
 * [ ] TBD
 
 ## WANT TO CONTRIBUTE!? ##
 _*OH MY GOD, PLEASE DO.*_
 
 Just make a pull request and add whatever you want (or fix whatever you want).  I'll review and approve if everything seems good.
 
-Thanks, everyone!
+Thanks, everyone!
@@ -2,18 +2,26 @@
 Core application that contains the `FileExtractor` class object
 """
 
-# It's good practice to have a logger instance if you use it
 import logging
 import mimetypes
-from io import BytesIO  # Ensure BytesIO is imported from io
+from io import BytesIO
 from pathlib import Path
 from tempfile import SpooledTemporaryFile
 from typing import IO
 
 from docx import Document
 
-# from loguru import logger # Assuming logger is configured if used
+# --- Module-level imports for optional PDF libraries ---
+try:
+    import pymupdf  # type: ignore
+except ImportError:
+    pymupdf = None  # Will be None if not installed
 
+try:
+    import pypdf  # type: ignore
+except ImportError:
+    pypdf = None  # Will be None if not installed
+# --- End of module-level imports ---
 
 logger = logging.getLogger(__name__)
 
@@ -133,11 +141,109 @@ def get_file_type(
             "pdf": "pdf",
             "txt": "plain",
             "csv": "csv",
+            # Add programming language mappings
+            "py": "x-python",
+            "js": "javascript",
+            "java": "x-java-source",
+            "c": "x-c",
+            "cpp": "x-c++",
+            "html": "html",
+            "css": "css",
+            "json": "json",
+            "xml": "xml",
         }
         return ext_to_mime_subtype.get(
             ext, "octet-stream"
         )  # Default to octet-stream
 
+    @staticmethod
+    def is_programming_language_file(file_ext: str) -> bool:
+        """
+        Tell whether an extension is on the list of code-like file types.
+
+        The list covers source code as well as markup, stylesheet, config
+        and documentation formats (e.g. html, css, json, yaml, md).
+        The lookup lowercases its input, so "PY" and "py" are equivalent;
+        a leading dot is not removed, so ".py" does not match.
+
+        Args:
+            file_ext: File extension (without dot)
+
+        Returns:
+            bool: True if the extension is on the list, False otherwise
+        """
+        # One whitespace-separated table keeps the list compact.
+        known_extensions = frozenset(
+            (
+                "py js ts java cpp c h hpp cs php rb go rs swift kt scala r sql "
+                "sh bash zsh ps1 bat cmd html htm css scss sass less xml json "
+                "yaml yml toml ini cfg conf md rst tex latex vue jsx tsx dart "
+                "pl pm lua vim asm s f f90 f95 cob cobol pas pp ml fs fsx elm "
+                "clj cljs ex exs erl hrl jl nim cr zig"
+            ).split()
+        )
+        normalized_ext = file_ext.lower()
+        return normalized_ext in known_extensions
+
+
     def get_contents(self) -> bytes:
         """
         Reads the contents from self.file and returns it as bytes.
@@ -194,6 +300,36 @@ def get_contents(self) -> bytes:
                 f"nor is it a Path or bytes."
             )
 
+    def code_file_read(self) -> str:
+        """
+        Reads a code file (.py, .js, .java, etc.) as text, formatting intact.
+        Tries utf-8-sig (UTF-8, BOM dropped), then cp1252, then latin-1.
+
+        Returns:
+            str: The file content as a string
+        """
+        contents_bytes = self.get_contents()
+
+        # latin-1 decodes any byte sequence, so it must be the last candidate
+        encodings_to_try = ["utf-8-sig", "cp1252", "latin-1"]
+
+        for encoding in encodings_to_try:
+            try:
+                content = contents_bytes.decode(encoding)
+                logger.info(
+                    f"Successfully decoded {self.file_name} using {encoding}"
+                )
+                return content
+            except UnicodeDecodeError:
+                continue
+
+        # Safety net only: latin-1 above cannot fail, so this is not reached
+        logger.warning(
+            f"Could not decode code file {self.file_name} with standard "
+            f"encodings, using utf-8 with replacement characters."
+        )
+        return contents_bytes.decode("utf-8", errors="replace")
+
     def pdf_file_read(self) -> str:  # Added return type hint
         """
         This current code provides a workaround in case MuPDF (a dependency
@@ -203,9 +339,13 @@ def pdf_file_read(self) -> str:  # Added return type hint
         extracted string data, those characters get filtered out.
         """
         contents = self.get_contents()  # This should now reliably return bytes
+        text = ""  # Default to empty string
 
         try:
-            import pymupdf  # type: ignore
+            if not pymupdf:  # Check if module-level import was successful
+                raise ImportError(
+                    "pymupdf module not available or import failed."
+                )
 
             # PyMuPDF's Document constructor can take bytes directly via the
             # 'stream' argument
@@ -220,23 +360,26 @@ def pdf_file_read(self) -> str:  # Added return type hint
                 f" {self.file_name}"
             )
             try:
-                import pypdf  # type: ignore
+                if not pypdf:  # Check if module-level import was successful
+                    raise ImportError(
+                        "pypdf module not available or import failed."
+                    )
 
                 # PyPDF2 needs a stream, so wrap bytes in BytesIO
                 pdf_stream = BytesIO(contents)
                 pdf_reader = pypdf.PdfReader(pdf_stream)
                 raw_text = [
                     page.extract_text()
                     for page in pdf_reader.pages
-                    if page.extract_text()
+                    if page.extract_text()  # Ensure text is not None or empty
                 ]
                 text = "".join(raw_text)
             except Exception as e_pypdf:
                 logger.error(
                     f"Both PyMuPDF and PyPDF2 failed for PDF "
                     f"{self.file_name}: {e_pypdf}"
                 )
-                text = ""  # Return empty string on failure
+                # text remains "" as initialized
         return text
 
     def docx_file_read(self) -> str:  # Added return type hint
 
@@ -37,26 +37,61 @@ def file_load(self):
         Returns:
             str
         """
-        file_type = self.file.file_ext
-        # file_type = file_loc.split('.')[-1]
+        file_type = self.file.file_ext.lower()
 
-        # file_types_tup = ('pdf', 'docx', 'doc', 'txt', 'text')
+        # Primary file extension mapping
         file_ext_matrix = {
             "pdf": "pdf_file_read",
             "docx": "docx_file_read",
             "txt": "text_file_read",
             "text": "text_file_read",
             "csv": "csv_file_read",
         }
+
+        # Check if it's a specific supported format first
         if file_type in file_ext_matrix:
             text = getattr(self.file, file_ext_matrix[file_type])()
             return text
+        # Check if it's a programming language file
+        elif self.file.is_programming_language_file(file_type):
+            logger.info(
+                f"Processing programming language file: {self.file.file_name}"
+            )
+            text = self.file.code_file_read()
+            return text
         else:
-            mime_type = self.file.get_file_type(self.file.file.name)
+            # Fall back to mime type detection
+            mime_type = self.file.get_file_type(self.file.file_name)
+
+            # Check if mime type suggests it's a text-based file
+            text_mime_types = [
+                "plain",
+                "javascript",
+                "x-python",
+                "x-c",
+                "x-java-source",
+                "x-c++",
+                "html",
+                "css",
+                "json",
+                "xml",
+            ]
+
+            if mime_type in text_mime_types:
+                logger.info(
+                    f"Processing text-based file by mime type: {mime_type}"
+                )
+                text = (
+                    self.file.code_file_read()
+                )  # Use code_file_read for better encoding handling
+                return text
 
             logger.error(
                 f"You are using an incorrect file format for file submissions. "
-                f"Please upload a .docx/.doc/.txt/.pdf file OR! Note the "
-                f"mimetype of your submitted data and submit an error report "
-                f"to github with the following: {mime_type}"
+                f"Please upload a .docx/.doc/.txt/.pdf file or a supported "
+                f"programming language file (.py, .js, .java, .cpp, etc.). "
+                f"Note the mimetype of your submitted data and submit an "
+                f"error report to github with the following: {mime_type}"
             )
+
+            return ""
@@ -3,4 +3,5 @@ PyMuPDF
 pypdf
 python-docx
 pytest
-pytest-lazy-fixtures
+pytest-lazy-fixtures
+pytest-mock
@@ -24,8 +24,11 @@ pytest==8.4.0
     # via
     #   -r core_requirements.in
     #   pytest-lazy-fixtures
+    #   pytest-mock
 pytest-lazy-fixtures==1.1.4
     # via -r core_requirements.in
+pytest-mock==3.14.1
+    # via -r core_requirements.in
 python-docx==1.1.2
     # via -r core_requirements.in
 typing-extensions==4.14.0
 
@@ -256,7 +256,7 @@ referencing==0.36.2
     #   jsonschema
     #   jsonschema-specifications
     #   jupyter-events
-requests==2.32.3
+requests==2.32.4
     # via
     #   id
     #   jupyterlab-server
@@ -280,7 +280,7 @@ rpds-py==0.25.1
     # via
     #   jsonschema
     #   referencing
-ruff==0.11.12
+ruff==0.11.13
     # via -r dev_requirements.in
 send2trash==1.8.3
     # via jupyter-server