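# NOTE(review): the original listing began with the lines "Spaces:", "Sleeping",
# "Sleeping" - apparently page-status residue from where this file was copied,
# not program code. Confirm and remove.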
import pdfplumber | |
from docx import Document | |
from openpyxl import load_workbook | |
import pdfplumber | |
import logging | |
from typing import List, Union, Tuple | |
import os | |
# Set up logging: configure the root logger at import time (INFO and above)
# and create the module-level logger used by the extractors below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def extract_pdf_content(pdf_path: str) -> List[str]:
    """
    Extract tables and text from a PDF, page by page.

    For each page, every non-empty table is emitted first, wrapped in
    [TABLE]...[/TABLE] markers with tab-separated cells and newline-separated
    rows, followed by the page's stripped text if it is not blank. No
    positional processing is done, so tables are not interleaved with the
    surrounding text.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        List[str]: List of extracted content chunks (text and tables).
            An empty list is returned when the file does not exist or when
            any error occurs while processing it; the error is logged.
    """
    if not os.path.exists(pdf_path):
        logger.error("PDF file not found: %s", pdf_path)
        return []

    try:
        content = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # First extract tables
                for table in page.extract_tables():
                    if not table:
                        continue
                    # Cells without text can come back as None; render them
                    # as empty strings instead of the literal text "None".
                    table_str = "\n".join(
                        "\t".join("" if cell is None else str(cell) for cell in row)
                        for row in table
                    )
                    content.append(f"[TABLE]\n{table_str}\n[/TABLE]")

                # Then extract regular text
                # NOTE(review): extract_text() presumably also returns the text
                # inside tables, so table content may appear twice - confirm.
                text = page.extract_text()
                if text and text.strip():
                    content.append(text.strip())

        logger.info("Successfully extracted content from %s", pdf_path)
        return content
    except Exception as e:
        # Best-effort extraction: any failure is logged and reported to the
        # caller as "no content" rather than raised.
        logger.error("Error processing %s: %s", pdf_path, e)
        return []
from docx import Document | |
from typing import List | |
import os | |
def extract_docx_content(docx_path: str) -> List[str]:
    """
    Extract text and tables from DOCX file with clear table markers.

    All non-empty paragraphs are emitted first, followed by all tables, so
    the original interleaving of paragraphs and tables is not preserved.

    Args:
        docx_path (str): Path to the DOCX file

    Returns:
        List[str]: List of extracted content chunks with tables marked as
            [TABLE]...[/TABLE]. Table cells are tab-separated and rows are
            newline-separated.

    Raises:
        FileNotFoundError: If docx_path does not exist.
    """
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"DOCX file not found: {docx_path}")

    doc = Document(docx_path)
    content = []

    # Process all paragraphs first, skipping blank ones
    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if text:
            content.append(text)

    # Process all tables after paragraphs
    for table in doc.tables:
        table_str = "\n".join(
            "\t".join(cell.text.strip() for cell in row.cells)
            for row in table.rows
        )
        # A table whose cells are all empty yields only tabs/newlines; skip it
        if table_str.strip():
            content.append(f"[TABLE]\n{table_str}\n[/TABLE]")

    return content
def extract_xlsx_content(file_path: str) -> List[str]:
    """
    Extract the cell contents of every worksheet in an XLSX workbook.

    Args:
        file_path (str): Path to the XLSX file

    Returns:
        List[str]: One string per sheet: a "--- Sheet: <title> ---" header
            line followed by one line per row with tab-separated cell values
            (empty cells become empty strings). Each sheet string is stripped
            of leading and trailing whitespace.
    """
    wb = load_workbook(file_path)
    sheets_text = []
    for sheet in wb:
        lines = [f"--- Sheet: {sheet.title} ---"]
        for row in sheet.iter_rows():
            # Only None marks an empty cell; a truthiness test would also
            # blank out legitimate values such as 0, 0.0 and False.
            lines.append(
                "\t".join(
                    "" if cell.value is None else str(cell.value) for cell in row
                )
            )
        sheets_text.append("\n".join(lines).strip())
    return sheets_text