# Spaces: marcosremar2 / mineru2 (Running) - File size: 9,441 Bytes - 550ec39

#!/usr/bin/env python3
"""
PDF to Markdown Converter using MinerU (vendor/mineru)
This is the main conversion script that uses the local MinerU installation
"""

import os
import sys
import logging
import argparse
from pathlib import Path
import subprocess

# Logging setup: records at INFO and above are sent to a stream handler and
# to the file 'pdf_converter.log' (relative path, resolved against the
# current working directory).
logging.basicConfig(
    handlers=[logging.StreamHandler(), logging.FileHandler('pdf_converter.log')],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by everything below.
logger = logging.getLogger(__name__)


class PdfConverterResult:
    """Outcome of one PDF conversion attempt.

    Attributes:
        pdf_path: Path of the PDF the conversion was attempted on.
        success: Whether the conversion succeeded.
        md_path: Path of the generated Markdown file, or None.
        time_taken: Duration of the conversion in seconds (defaults to 0).
        error: Description of the failure, or None.
    """

    def __init__(self, pdf_path: str, success: bool, md_path: str = None,
                 time_taken: float = 0, error: str = None):
        """Store the given values unchanged on the instance."""
        self.pdf_path, self.success = pdf_path, success
        self.md_path, self.time_taken = md_path, time_taken
        self.error = error

    def __str__(self):
        """Return a one-line, human-readable summary of the outcome."""
        # Failure first, so the success message is the fall-through case.
        if not self.success:
            return f"❌ Failed to convert {self.pdf_path}: {self.error}"
        return f"✅ Successfully converted {self.pdf_path} in {self.time_taken:.2f}s"


class MineruPdfConverter:
    """
    PDF to Markdown converter using MinerU

    Conversion is delegated to the external ``mineru`` command, run as a
    subprocess. Every PDF gets its own directory
    ``<output_dir>/<pdf stem>``, which is passed to MinerU as its output
    location.
    """

    def __init__(self, output_dir: str = "output"):
        """Remember *output_dir* and create it if it does not exist yet."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def convert_file(self, pdf_path: str, delete_after: bool = False) -> PdfConverterResult:
        """Convert a single PDF file to Markdown using MinerU

        Args:
            pdf_path: Path of the PDF to convert.
            delete_after: If true, delete the PDF once a Markdown file has
                been located.

        Returns:
            A PdfConverterResult. Failures are reported through the result
            (``success=False`` plus ``error``) rather than raised; every
            result, successful or not, carries the elapsed time.
        """
        import time
        start_time = time.time()

        def failure(message: str) -> PdfConverterResult:
            # Failed attempts report how long they took as well.
            return PdfConverterResult(
                str(pdf_path), False,
                time_taken=time.time() - start_time, error=message
            )

        try:
            pdf_path = Path(pdf_path)
            if not pdf_path.exists():
                return failure(f"File not found: {pdf_path}")

            logger.info(f"Processing: {pdf_path}")

            # Prepare output directory
            pdf_output_dir = os.path.join(self.output_dir, pdf_path.stem)

            # Run MinerU command
            cmd = [
                "mineru",
                "-p", str(pdf_path),
                "-o", pdf_output_dir,
                "-m", "txt",  # Use text mode
                "-f", "false",  # Disable formula parsing for speed
                "-t", "false",  # Disable table parsing for speed
            ]

            logger.info(f"Running command: {' '.join(cmd)}")

            # Execute MinerU
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode != 0:
                # Prefer stderr, fall back to stdout; ignore whitespace-only
                # output so the message is never blank.
                error_msg = (
                    (result.stderr or "").strip()
                    or (result.stdout or "").strip()
                    or f"Unknown error (exit code {result.returncode})"
                )
                return failure(error_msg)

            # Find the generated markdown file: first at the location this
            # script expects, then anywhere below the per-PDF output directory.
            md_path = None
            expected_md = Path(pdf_output_dir) / pdf_path.stem / "txt" / f"{pdf_path.stem}.md"

            if expected_md.exists():
                md_path = str(expected_md)
                logger.info(f"✅ Markdown file created: {md_path}")
            else:
                # Sort so the chosen file does not depend on directory
                # iteration order.
                candidates = sorted(Path(pdf_output_dir).rglob("*.md"))
                if candidates:
                    md_path = str(candidates[0])
                    logger.info(f"✅ Found markdown file: {md_path}")

            if not md_path:
                return failure("No markdown file generated")

            # Delete original PDF if requested
            if delete_after and pdf_path.exists():
                pdf_path.unlink()
                logger.info(f"🗑️  Deleted original PDF: {pdf_path}")

            elapsed_time = time.time() - start_time

            return PdfConverterResult(
                str(pdf_path), True, md_path=md_path, time_taken=elapsed_time
            )

        except Exception as e:
            # Boundary handler: any unexpected error (including a missing
            # `mineru` executable) becomes a failed result. logger.exception
            # sends the traceback through the configured log handlers.
            logger.exception(f"Error processing {pdf_path}: {e}")
            return failure(str(e))


class BatchProcessor:
    """Process multiple PDF files in batch

    PDFs are discovered recursively below ``batch_dir`` and converted one
    after another with a MineruPdfConverter writing to ``output_dir``.
    """

    def __init__(self, batch_dir: str = "batch-files", output_dir: str = "output",
                 workers: int = 1, delete_after: bool = False):
        """Store the settings and create the converter.

        Args:
            batch_dir: Directory searched (recursively) for PDF files.
            output_dir: Directory handed to the converter for its output.
            workers: Stored for interface compatibility; process_batch runs
                sequentially and does not use it.
            delete_after: Forwarded to the converter for every file.
        """
        self.batch_dir = batch_dir
        self.output_dir = output_dir
        self.workers = workers
        self.delete_after = delete_after
        self.converter = MineruPdfConverter(output_dir)

    def find_pdf_files(self) -> list[Path]:
        """Find all PDF files in the batch directory

        Returns:
            Sorted list of files below ``batch_dir`` whose name ends in
            ``.pdf`` (case-insensitive); empty if the directory is missing.
        """
        batch_path = Path(self.batch_dir)

        if not batch_path.exists():
            logger.warning(f"Batch directory not found: {self.batch_dir}")
            return []

        # Match the extension case-insensitively so "REPORT.PDF" is found on
        # case-sensitive filesystems too, and sort for a reproducible order.
        pdf_files = sorted(
            path for path in batch_path.rglob("*")
            if path.name.lower().endswith(".pdf") and path.is_file()
        )
        logger.info(f"Found {len(pdf_files)} PDF files in {self.batch_dir}")

        return pdf_files

    def process_batch(self) -> tuple[int, int]:
        """Process all PDFs in the batch directory

        Returns:
            ``(successful, failed)`` counts; ``(0, 0)`` when nothing was found.
        """
        pdf_files = self.find_pdf_files()

        if not pdf_files:
            logger.info("No PDF files found to process")
            return 0, 0

        if self.workers != 1:
            # Tell the caller instead of silently ignoring the setting.
            logger.warning(
                f"workers={self.workers} requested, but files are processed sequentially"
            )

        successful = 0
        failed = 0

        logger.info(f"Starting batch processing of {len(pdf_files)} files...")

        # Process files sequentially (MinerU already handles parallelism internally)
        for pdf_file in pdf_files:
            result = self.converter.convert_file(str(pdf_file), self.delete_after)

            # str(result) already starts with the status emoji, so it is
            # logged as-is rather than prefixed with a second one.
            if result.success:
                successful += 1
                logger.info(str(result))
            else:
                failed += 1
                logger.error(str(result))

        return successful, failed


def main(argv=None):
    """Main entry point

    Parses the command line, runs the requested command and terminates the
    process via ``sys.exit``.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]`` when None.

    Command selection:
        * ``convert`` / ``batch`` as the first argument select that command.
        * No arguments at all: ``batch`` with its default settings.
        * A first argument that is not a command but ends in ``.pdf`` or
          names an existing path: treated as ``convert <that argument>``.

    Exit status is 0 when the conversion (or every file of the batch)
    succeeded and 1 otherwise; usage errors exit through argparse.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF files to Markdown using MinerU",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert a single PDF
  %(prog)s convert path/to/file.pdf
  
  # Batch convert all PDFs in batch-files directory
  %(prog)s batch
  
  # Batch convert with custom settings
  %(prog)s batch --batch-dir /path/to/pdfs --output-dir /path/to/output --workers 4
  
  # Delete PDFs after successful conversion
  %(prog)s batch --delete-after
        """
    )

    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    # Convert command
    convert_parser = subparsers.add_parser('convert', help='Convert a single PDF file')
    convert_parser.add_argument('pdf_file', help='Path to PDF file')
    convert_parser.add_argument('--output-dir', default='output', help='Output directory')
    convert_parser.add_argument('--delete-after', action='store_true',
                                help='Delete PDF after successful conversion')

    # Batch command
    batch_parser = subparsers.add_parser('batch', help='Batch convert PDF files')
    batch_parser.add_argument('--batch-dir', default='batch-files',
                              help='Directory containing PDF files')
    batch_parser.add_argument('--output-dir', default='output',
                              help='Output directory')
    batch_parser.add_argument('--workers', type=int, default=1,
                              help='Number of parallel workers')
    batch_parser.add_argument('--delete-after', action='store_true',
                              help='Delete PDFs after successful conversion')

    # Auto-detect the command BEFORE parsing. argparse rejects an unknown
    # first positional as an invalid sub-command choice, so the shorthand
    # "<prog> file.pdf" has to be rewritten into "<prog> convert file.pdf"
    # up front; doing it after parse_args() would never be reached.
    known_commands = ('convert', 'batch')
    cli_args = list(sys.argv[1:] if argv is None else argv)

    if not cli_args:
        # Default to batch mode
        cli_args = ['batch']
    else:
        first = cli_args[0]
        if first not in known_commands and not first.startswith('-'):
            # If first argument looks like a file, assume convert command
            if first.lower().endswith('.pdf') or Path(first).exists():
                cli_args.insert(0, 'convert')

    args = parser.parse_args(cli_args)

    # Execute command
    if args.command == 'convert':
        converter = MineruPdfConverter(args.output_dir)
        result = converter.convert_file(args.pdf_file, args.delete_after)
        print(result)
        sys.exit(0 if result.success else 1)

    # 'batch' is the only other command the parser accepts.
    processor = BatchProcessor(
        args.batch_dir,
        args.output_dir,
        args.workers,
        args.delete_after
    )
    successful, failed = processor.process_batch()

    print(f"\n📊 Batch processing complete:")
    print(f"   ✅ Successful: {successful}")
    print(f"   ❌ Failed: {failed}")
    print(f"   📁 Output directory: {args.output_dir}")

    sys.exit(0 if failed == 0 else 1)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()