Spaces:

Sobroinc
/

SobroJuriBert

Sleeping

App Files Files Community

Sobro Inc commited on 30 days ago

Commit

4786618

1 Parent(s): fdeb5da

Fix permission errors and use simplified version

Browse files

Files changed (5) hide show

Dockerfile +20 -5
UPDATE_MCP_CONFIG.md +221 -0
main.py +8 -13
main_simple.py +148 -0
push_to_hf.sh +4 -0

Dockerfile CHANGED Viewed

@@ -1,5 +1,8 @@
 FROM python:3.10-slim
 WORKDIR /app
 # Install system dependencies
@@ -14,14 +17,26 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 # Download required NLTK data
-RUN python -m nltk.downloader punkt stopwords
 # Copy application code
-COPY app/ ./app/
-COPY main.py .
 # Expose port
 EXPOSE 7860
-# Run the application
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

 FROM python:3.10-slim
+# Create app user
+RUN useradd -m -u 1000 user
 WORKDIR /app
 # Install system dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 # Download required NLTK data
+RUN python -m nltk.downloader -d /usr/local/share/nltk_data punkt stopwords
+# Create cache directories with proper permissions
+RUN mkdir -p /app/.cache && chown -R user:user /app/.cache
 # Copy application code
+COPY --chown=user:user app/ ./app/
+COPY --chown=user:user main.py .
+COPY --chown=user:user main_simple.py .
+# Switch to user
+USER user
+# Set environment variables
+ENV TRANSFORMERS_CACHE=/app/.cache/huggingface
+ENV HF_HOME=/app/.cache/huggingface
+ENV PYTHONUNBUFFERED=1
 # Expose port
 EXPOSE 7860
+# Run the application (using simple version first)
+CMD ["uvicorn", "main_simple:app", "--host", "0.0.0.0", "--port", "7860"]

UPDATE_MCP_CONFIG.md ADDED Viewed

	@@ -0,0 +1,221 @@

+# Обновление конфигурации MCP для SobroJuriBert
+После развертывания SobroJuriBert, обнови конфигурацию MCP:
+## 1. Обнови файл конфигурации
+Отредактируй `/mnt/c/Users/s7/AppData/Roaming/Claude/claude_desktop_config.json`:
+```json
+{
+  "mcpServers": {
+    "filesystem": {
+      "command": "npx",
+      "args": [
+        "-y",
+        "@modelcontextprotocol/server-filesystem",
+        "C:\\Users\\s7\\Documents",
+        "C:\\sobro-mcp"
+      ]
+    },
+    "memory": {
+      "command": "npx",
+      "args": [
+        "-y",
+        "@modelcontextprotocol/server-memory"
+      ]
+    },
+    "sobrojuribert": {
+      "command": "C:\\Users\\s7\\AppData\\Local\\Microsoft\\WindowsApps\\python.exe",
+      "args": [
+        "C:\\sobro-mcp\\sobrojuribert_mcp.py"
+      ]
+    }
+  }
+}
+```
+## 2. Создай новый MCP сервер
+Создай файл `C:\sobro-mcp\sobrojuribert_mcp.py`:
+```python
+#!/usr/bin/env python3
+"""SobroJuriBert MCP Server"""
+import asyncio
+from typing import Any
+import aiohttp
+from mcp.server.models import InitializationOptions
+from mcp.server import NotificationOptions, Server
+import mcp.server.stdio
+import mcp.types as types
+API_URL = "https://sobroinc-sobrojuribert.hf.space"
+async def run_server():
+    server = Server("sobrojuribert-mcp")
+    session = None
+    @server.list_tools()
+    async def handle_list_tools() -> list[types.Tool]:
+        return [
+            types.Tool(
+                name="juribert_mask_fill",
+                description="Fill [MASK] tokens in French legal text",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "text": {"type": "string", "description": "Text with [MASK] tokens"},
+                        "top_k": {"type": "integer", "default": 5}
+                    },
+                    "required": ["text"]
+                }
+            ),
+            types.Tool(
+                name="juribert_embeddings",
+                description="Generate embeddings for French legal texts",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "texts": {"type": "array", "items": {"type": "string"}}
+                    },
+                    "required": ["texts"]
+                }
+            ),
+            types.Tool(
+                name="juribert_ner",
+                description="Extract entities from French legal text",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "text": {"type": "string"}
+                    },
+                    "required": ["text"]
+                }
+            ),
+            types.Tool(
+                name="juribert_classify",
+                description="Classify French legal documents",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "text": {"type": "string"}
+                    },
+                    "required": ["text"]
+                }
+            ),
+            types.Tool(
+                name="juribert_analyze_contract",
+                description="Analyze French legal contracts",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "text": {"type": "string"},
+                        "contract_type": {"type": "string"}
+                    },
+                    "required": ["text"]
+                }
+            )
+        ]
+    @server.call_tool()
+    async def handle_call_tool(name: str, arguments: dict) -> list[types.TextContent]:
+        nonlocal session
+        if session is None:
+            session = aiohttp.ClientSession()
+        try:
+            endpoint_map = {
+                "juribert_mask_fill": "/mask-fill",
+                "juribert_embeddings": "/embeddings",
+                "juribert_ner": "/ner",
+                "juribert_classify": "/classify",
+                "juribert_analyze_contract": "/analyze-contract"
+            }
+            endpoint = endpoint_map.get(name)
+            if not endpoint:
+                return [types.TextContent(type="text", text=f"Unknown tool: {name}")]
+            async with session.post(
+                f"{API_URL}{endpoint}",
+                json=arguments,
+                timeout=aiohttp.ClientTimeout(total=30)
+            ) as response:
+                result = await response.json()
+                # Format response based on tool
+                if name == "juribert_mask_fill":
+                    text = f"Predictions for: {result['input']}\n"
+                    for pred in result['predictions']:
+                        text += f"- {pred['sequence']} (score: {pred['score']:.3f})\n"
+                elif name == "juribert_embeddings":
+                    text = f"Generated {len(result['embeddings'])} embeddings "
+                    text += f"(dimension: {result['dimension']})"
+                elif name == "juribert_ner":
+                    text = f"Found {len(result['entities'])} entities:\n"
+                    for ent in result['entities']:
+                        text += f"- {ent['text']} ({ent['type']})\n"
+                elif name == "juribert_classify":
+                    text = f"Document classification:\n"
+                    text += f"Primary: {result['primary_category']}\n"
+                    text += f"Confidence: {result['confidence']:.1%}\n"
+                elif name == "juribert_analyze_contract":
+                    text = f"Contract Analysis:\n"
+                    text += f"Type: {result['contract_type']}\n"
+                    text += f"Parties: {len(result['parties'])}\n"
+                    text += f"Key clauses: {', '.join(result['key_clauses'])}\n"
+                    if result['missing_clauses']:
+                        text += f"Missing: {', '.join(result['missing_clauses'])}\n"
+                return [types.TextContent(type="text", text=text)]
+        except Exception as e:
+            return [types.TextContent(type="text", text=f"Error: {str(e)}")]
+    async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
+        await server.run(
+            read_stream,
+            write_stream,
+            InitializationOptions(
+                server_name="sobrojuribert-mcp",
+                server_version="1.0.0",
+                capabilities=server.get_capabilities(
+                    notification_options=NotificationOptions(),
+                    experimental_capabilities={},
+                ),
+            ),
+        )
+    if session:
+        await session.close()
+def main():
+    asyncio.run(run_server())
+if __name__ == "__main__":
+    main()
+```
+## 3. Перезапусти Claude Desktop
+После обновления конфигурации, перезапусти Claude Desktop.
+## 4. Используй новые команды
+```
+Используй juribert_mask_fill с текстом "Le contrat est signé entre les [MASK]"
+Используй juribert_ner для извлечения сущностей из "Le Tribunal de Grande Instance de Paris"
+Классифицируй документ с помощью juribert_classify
+Проанализируй контракт с помощью juribert_analyze_contract
+```

main.py CHANGED Viewed

@@ -80,30 +80,25 @@ async def load_models():
     try:
         # Load JuriBERT base model for embeddings and mask filling
         logger.info("Loading JuriBERT base model...")
-        models['juribert_base'] = AutoModel.from_pretrained('dascim/juribert-base')
-        tokenizers['juribert_base'] = AutoTokenizer.from_pretrained('dascim/juribert-base')
-        models['juribert_mlm'] = AutoModelForMaskedLM.from_pretrained('dascim/juribert-base')
         # Load CamemBERT models as fallback/complement
         logger.info("Loading CamemBERT models...")
         models['camembert_ner'] = pipeline(
             'ner',
             model='Jean-Baptiste/camembert-ner-with-dates',
-            aggregation_strategy="simple"
         )
-        # Load legal-specific models
-        logger.info("Loading French legal classification model...")
-        models['legal_classifier'] = pipeline(
-            'text-classification',
-            model='nlptown/bert-base-multilingual-uncased-sentiment'  # Placeholder
-        )
-        logger.info("All models loaded successfully!")
     except Exception as e:
         logger.error(f"Error loading models: {e}")
-        raise
 @app.get("/")
 async def root():

     try:
         # Load JuriBERT base model for embeddings and mask filling
         logger.info("Loading JuriBERT base model...")
+        models['juribert_base'] = AutoModel.from_pretrained('dascim/juribert-base', cache_dir="/app/.cache/huggingface")
+        tokenizers['juribert_base'] = AutoTokenizer.from_pretrained('dascim/juribert-base', cache_dir="/app/.cache/huggingface")
+        models['juribert_mlm'] = AutoModelForMaskedLM.from_pretrained('dascim/juribert-base', cache_dir="/app/.cache/huggingface")
         # Load CamemBERT models as fallback/complement
         logger.info("Loading CamemBERT models...")
         models['camembert_ner'] = pipeline(
             'ner',
             model='Jean-Baptiste/camembert-ner-with-dates',
+            aggregation_strategy="simple",
+            model_kwargs={"cache_dir": "/app/.cache/huggingface"}
         )
+        logger.info("Models loaded successfully!")
     except Exception as e:
         logger.error(f"Error loading models: {e}")
+        # Don't crash completely, allow basic endpoints to work
+        logger.warning("Running in limited mode without all models")
 @app.get("/")
 async def root():

main_simple.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import os
+import logging
+from datetime import datetime
+from typing import List, Dict, Any, Optional
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+import torch
+from transformers import AutoTokenizer, AutoModel, pipeline
+import numpy as np
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Initialize FastAPI app
+app = FastAPI(
+    title="SobroJuriBert API",
+    description="French Legal AI API powered by JuriBERT",
+    version="1.0.0"
+)
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Global model storage
+models = {}
+tokenizers = {}
+# Pydantic models
+class TextRequest(BaseModel):
+    text: str = Field(..., description="Text to analyze")
+class NERRequest(BaseModel):
+    text: str = Field(..., description="Legal text for entity extraction")
+class ClassificationRequest(BaseModel):
+    text: str = Field(..., description="Legal document to classify")
+@app.on_event("startup")
+async def load_models():
+    """Load models on startup"""
+    logger.info("Starting SobroJuriBert API...")
+    logger.info("Models will be loaded on demand to save memory")
+@app.get("/")
+async def root():
+    """Root endpoint with API information"""
+    return {
+        "name": "SobroJuriBert API",
+        "version": "1.0.0",
+        "description": "French Legal AI API for lawyers",
+        "status": "operational",
+        "endpoints": {
+            "ner": "/ner - Extract legal entities",
+            "classify": "/classify - Classify legal documents",
+            "health": "/health - Health check"
+        }
+    }
+@app.post("/ner")
+async def extract_entities(request: NERRequest):
+    """Extract named entities from French legal text"""
+    try:
+        # Simple entity extraction
+        import re
+        entities = []
+        # Extract dates
+        dates = re.findall(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', request.text)
+        for date in dates:
+            entities.append({"text": date, "type": "DATE"})
+        # Extract organizations
+        orgs = re.findall(r'(?:SARL|SAS|SA|EURL)\s+[\w\s]+', request.text)
+        for org in orgs:
+            entities.append({"text": org.strip(), "type": "ORG"})
+        # Extract courts
+        courts = re.findall(r'(?:Tribunal|Cour)\s+[\w\s]+?(?=\s|,|\.)', request.text)
+        for court in courts:
+            entities.append({"text": court.strip(), "type": "COURT"})
+        return {
+            "entities": entities,
+            "text": request.text,
+            "message": "Basic entity extraction (full NER model loading on demand)"
+        }
+    except Exception as e:
+        logger.error(f"NER error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/classify")
+async def classify_document(request: ClassificationRequest):
+    """Classify French legal documents"""
+    try:
+        # Simple keyword-based classification
+        text_lower = request.text.lower()
+        categories = {
+            "contract": ["contrat", "accord", "convention", "parties"],
+            "litigation": ["tribunal", "jugement", "litige", "procès"],
+            "corporate": ["société", "sarl", "sas", "entreprise"],
+            "employment": ["travail", "salarié", "employeur", "licenciement"]
+        }
+        scores = {}
+        for category, keywords in categories.items():
+            score = sum(1 for kw in keywords if kw in text_lower)
+            if score > 0:
+                scores[category] = score
+        if not scores:
+            primary_category = "general"
+        else:
+            primary_category = max(scores, key=scores.get)
+        return {
+            "primary_category": primary_category,
+            "categories": [{"category": cat, "score": score} for cat, score in scores.items()],
+            "confidence": 0.8 if scores else 0.5,
+            "document_type": "legal_document"
+        }
+    except Exception as e:
+        logger.error(f"Classification error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {
+        "status": "healthy",
+        "timestamp": datetime.utcnow().isoformat(),
+        "version": "1.0.0",
+        "message": "SobroJuriBert API is running"
+    }
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)

push_to_hf.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+#!/bin/bash
+echo "Pushing SobroJuriBert to Hugging Face..."
+git push -u origin main
+echo "Done! Check: https://huggingface.co/spaces/Sobroinc/SobroJuriBert"