Sobro Inc commited on
Commit
4786618
·
1 Parent(s): fdeb5da

Fix permission errors and use simplified version

Browse files
Files changed (5) hide show
  1. Dockerfile +20 -5
  2. UPDATE_MCP_CONFIG.md +221 -0
  3. main.py +8 -13
  4. main_simple.py +148 -0
  5. push_to_hf.sh +4 -0
Dockerfile CHANGED
@@ -1,5 +1,8 @@
1
  FROM python:3.10-slim
2
 
 
 
 
3
  WORKDIR /app
4
 
5
  # Install system dependencies
@@ -14,14 +17,26 @@ COPY requirements.txt .
14
  RUN pip install --no-cache-dir -r requirements.txt
15
 
16
  # Download required NLTK data
17
- RUN python -m nltk.downloader punkt stopwords
 
 
 
18
 
19
  # Copy application code
20
- COPY app/ ./app/
21
- COPY main.py .
 
 
 
 
 
 
 
 
 
22
 
23
  # Expose port
24
  EXPOSE 7860
25
 
26
- # Run the application
27
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  FROM python:3.10-slim
2
 
3
+ # Create app user
4
+ RUN useradd -m -u 1000 user
5
+
6
  WORKDIR /app
7
 
8
  # Install system dependencies
 
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
  # Download required NLTK data
20
+ RUN python -m nltk.downloader -d /usr/local/share/nltk_data punkt stopwords
21
+
22
+ # Create cache directories with proper permissions
23
+ RUN mkdir -p /app/.cache && chown -R user:user /app/.cache
24
 
25
  # Copy application code
26
+ COPY --chown=user:user app/ ./app/
27
+ COPY --chown=user:user main.py .
28
+ COPY --chown=user:user main_simple.py .
29
+
30
+ # Switch to user
31
+ USER user
32
+
33
+ # Set environment variables
34
+ ENV TRANSFORMERS_CACHE=/app/.cache/huggingface
35
+ ENV HF_HOME=/app/.cache/huggingface
36
+ ENV PYTHONUNBUFFERED=1
37
 
38
  # Expose port
39
  EXPOSE 7860
40
 
41
+ # Run the application (using simple version first)
42
+ CMD ["uvicorn", "main_simple:app", "--host", "0.0.0.0", "--port", "7860"]
UPDATE_MCP_CONFIG.md ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Обновление конфигурации MCP для SobroJuriBert
2
+
3
+ После развертывания SobroJuriBert, обнови конфигурацию MCP:
4
+
5
+ ## 1. Обнови файл конфигурации
6
+
7
+ Отредактируй `/mnt/c/Users/s7/AppData/Roaming/Claude/claude_desktop_config.json`:
8
+
9
+ ```json
10
+ {
11
+ "mcpServers": {
12
+ "filesystem": {
13
+ "command": "npx",
14
+ "args": [
15
+ "-y",
16
+ "@modelcontextprotocol/server-filesystem",
17
+ "C:\\Users\\s7\\Documents",
18
+ "C:\\sobro-mcp"
19
+ ]
20
+ },
21
+ "memory": {
22
+ "command": "npx",
23
+ "args": [
24
+ "-y",
25
+ "@modelcontextprotocol/server-memory"
26
+ ]
27
+ },
28
+ "sobrojuribert": {
29
+ "command": "C:\\Users\\s7\\AppData\\Local\\Microsoft\\WindowsApps\\python.exe",
30
+ "args": [
31
+ "C:\\sobro-mcp\\sobrojuribert_mcp.py"
32
+ ]
33
+ }
34
+ }
35
+ }
36
+ ```
37
+
38
+ ## 2. Создай новый MCP сервер
39
+
40
+ Создай файл `C:\sobro-mcp\sobrojuribert_mcp.py`:
41
+
42
+ ```python
43
+ #!/usr/bin/env python3
44
+ """SobroJuriBert MCP Server"""
45
+
46
+ import asyncio
47
+ from typing import Any
48
+ import aiohttp
49
+ from mcp.server.models import InitializationOptions
50
+ from mcp.server import NotificationOptions, Server
51
+ import mcp.server.stdio
52
+ import mcp.types as types
53
+
54
+ API_URL = "https://sobroinc-sobrojuribert.hf.space"
55
+
56
+ async def run_server():
57
+ server = Server("sobrojuribert-mcp")
58
+
59
+ session = None
60
+
61
+ @server.list_tools()
62
+ async def handle_list_tools() -> list[types.Tool]:
63
+ return [
64
+ types.Tool(
65
+ name="juribert_mask_fill",
66
+ description="Fill [MASK] tokens in French legal text",
67
+ inputSchema={
68
+ "type": "object",
69
+ "properties": {
70
+ "text": {"type": "string", "description": "Text with [MASK] tokens"},
71
+ "top_k": {"type": "integer", "default": 5}
72
+ },
73
+ "required": ["text"]
74
+ }
75
+ ),
76
+ types.Tool(
77
+ name="juribert_embeddings",
78
+ description="Generate embeddings for French legal texts",
79
+ inputSchema={
80
+ "type": "object",
81
+ "properties": {
82
+ "texts": {"type": "array", "items": {"type": "string"}}
83
+ },
84
+ "required": ["texts"]
85
+ }
86
+ ),
87
+ types.Tool(
88
+ name="juribert_ner",
89
+ description="Extract entities from French legal text",
90
+ inputSchema={
91
+ "type": "object",
92
+ "properties": {
93
+ "text": {"type": "string"}
94
+ },
95
+ "required": ["text"]
96
+ }
97
+ ),
98
+ types.Tool(
99
+ name="juribert_classify",
100
+ description="Classify French legal documents",
101
+ inputSchema={
102
+ "type": "object",
103
+ "properties": {
104
+ "text": {"type": "string"}
105
+ },
106
+ "required": ["text"]
107
+ }
108
+ ),
109
+ types.Tool(
110
+ name="juribert_analyze_contract",
111
+ description="Analyze French legal contracts",
112
+ inputSchema={
113
+ "type": "object",
114
+ "properties": {
115
+ "text": {"type": "string"},
116
+ "contract_type": {"type": "string"}
117
+ },
118
+ "required": ["text"]
119
+ }
120
+ )
121
+ ]
122
+
123
+ @server.call_tool()
124
+ async def handle_call_tool(name: str, arguments: dict) -> list[types.TextContent]:
125
+ nonlocal session
126
+
127
+ if session is None:
128
+ session = aiohttp.ClientSession()
129
+
130
+ try:
131
+ endpoint_map = {
132
+ "juribert_mask_fill": "/mask-fill",
133
+ "juribert_embeddings": "/embeddings",
134
+ "juribert_ner": "/ner",
135
+ "juribert_classify": "/classify",
136
+ "juribert_analyze_contract": "/analyze-contract"
137
+ }
138
+
139
+ endpoint = endpoint_map.get(name)
140
+ if not endpoint:
141
+ return [types.TextContent(type="text", text=f"Unknown tool: {name}")]
142
+
143
+ async with session.post(
144
+ f"{API_URL}{endpoint}",
145
+ json=arguments,
146
+ timeout=aiohttp.ClientTimeout(total=30)
147
+ ) as response:
148
+ result = await response.json()
149
+
150
+ # Format response based on tool
151
+ if name == "juribert_mask_fill":
152
+ text = f"Predictions for: {result['input']}\n"
153
+ for pred in result['predictions']:
154
+ text += f"- {pred['sequence']} (score: {pred['score']:.3f})\n"
155
+
156
+ elif name == "juribert_embeddings":
157
+ text = f"Generated {len(result['embeddings'])} embeddings "
158
+ text += f"(dimension: {result['dimension']})"
159
+
160
+ elif name == "juribert_ner":
161
+ text = f"Found {len(result['entities'])} entities:\n"
162
+ for ent in result['entities']:
163
+ text += f"- {ent['text']} ({ent['type']})\n"
164
+
165
+ elif name == "juribert_classify":
166
+ text = f"Document classification:\n"
167
+ text += f"Primary: {result['primary_category']}\n"
168
+ text += f"Confidence: {result['confidence']:.1%}\n"
169
+
170
+ elif name == "juribert_analyze_contract":
171
+ text = f"Contract Analysis:\n"
172
+ text += f"Type: {result['contract_type']}\n"
173
+ text += f"Parties: {len(result['parties'])}\n"
174
+ text += f"Key clauses: {', '.join(result['key_clauses'])}\n"
175
+ if result['missing_clauses']:
176
+ text += f"Missing: {', '.join(result['missing_clauses'])}\n"
177
+
178
+ return [types.TextContent(type="text", text=text)]
179
+
180
+ except Exception as e:
181
+ return [types.TextContent(type="text", text=f"Error: {str(e)}")]
182
+
183
+ async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
184
+ await server.run(
185
+ read_stream,
186
+ write_stream,
187
+ InitializationOptions(
188
+ server_name="sobrojuribert-mcp",
189
+ server_version="1.0.0",
190
+ capabilities=server.get_capabilities(
191
+ notification_options=NotificationOptions(),
192
+ experimental_capabilities={},
193
+ ),
194
+ ),
195
+ )
196
+
197
+ if session:
198
+ await session.close()
199
+
200
+ def main():
201
+ asyncio.run(run_server())
202
+
203
+ if __name__ == "__main__":
204
+ main()
205
+ ```
206
+
207
+ ## 3. Перезапусти Claude Desktop
208
+
209
+ После обновления конфигурации, перезапусти Claude Desktop.
210
+
211
+ ## 4. Используй новые команды
212
+
213
+ ```
214
+ Используй juribert_mask_fill с текстом "Le contrat est signé entre les [MASK]"
215
+
216
+ Используй juribert_ner для извлечения сущностей из "Le Tribunal de Grande Instance de Paris"
217
+
218
+ Классифицируй документ с помощью juribert_classify
219
+
220
+ Проанализируй контракт с помощью juribert_analyze_contract
221
+ ```
main.py CHANGED
@@ -80,30 +80,25 @@ async def load_models():
80
  try:
81
  # Load JuriBERT base model for embeddings and mask filling
82
  logger.info("Loading JuriBERT base model...")
83
- models['juribert_base'] = AutoModel.from_pretrained('dascim/juribert-base')
84
- tokenizers['juribert_base'] = AutoTokenizer.from_pretrained('dascim/juribert-base')
85
- models['juribert_mlm'] = AutoModelForMaskedLM.from_pretrained('dascim/juribert-base')
86
 
87
  # Load CamemBERT models as fallback/complement
88
  logger.info("Loading CamemBERT models...")
89
  models['camembert_ner'] = pipeline(
90
  'ner',
91
  model='Jean-Baptiste/camembert-ner-with-dates',
92
- aggregation_strategy="simple"
 
93
  )
94
 
95
- # Load legal-specific models
96
- logger.info("Loading French legal classification model...")
97
- models['legal_classifier'] = pipeline(
98
- 'text-classification',
99
- model='nlptown/bert-base-multilingual-uncased-sentiment' # Placeholder
100
- )
101
-
102
- logger.info("All models loaded successfully!")
103
 
104
  except Exception as e:
105
  logger.error(f"Error loading models: {e}")
106
- raise
 
107
 
108
  @app.get("/")
109
  async def root():
 
80
  try:
81
  # Load JuriBERT base model for embeddings and mask filling
82
  logger.info("Loading JuriBERT base model...")
83
+ models['juribert_base'] = AutoModel.from_pretrained('dascim/juribert-base', cache_dir="/app/.cache/huggingface")
84
+ tokenizers['juribert_base'] = AutoTokenizer.from_pretrained('dascim/juribert-base', cache_dir="/app/.cache/huggingface")
85
+ models['juribert_mlm'] = AutoModelForMaskedLM.from_pretrained('dascim/juribert-base', cache_dir="/app/.cache/huggingface")
86
 
87
  # Load CamemBERT models as fallback/complement
88
  logger.info("Loading CamemBERT models...")
89
  models['camembert_ner'] = pipeline(
90
  'ner',
91
  model='Jean-Baptiste/camembert-ner-with-dates',
92
+ aggregation_strategy="simple",
93
+ model_kwargs={"cache_dir": "/app/.cache/huggingface"}
94
  )
95
 
96
+ logger.info("Models loaded successfully!")
 
 
 
 
 
 
 
97
 
98
  except Exception as e:
99
  logger.error(f"Error loading models: {e}")
100
+ # Don't crash completely, allow basic endpoints to work
101
+ logger.warning("Running in limited mode without all models")
102
 
103
  @app.get("/")
104
  async def root():
main_simple.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from datetime import datetime
4
+ from typing import List, Dict, Any, Optional
5
+ from fastapi import FastAPI, HTTPException
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from pydantic import BaseModel, Field
8
+ import torch
9
+ from transformers import AutoTokenizer, AutoModel, pipeline
10
+ import numpy as np
11
+
12
+ # Configure logging
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Initialize FastAPI app
17
+ app = FastAPI(
18
+ title="SobroJuriBert API",
19
+ description="French Legal AI API powered by JuriBERT",
20
+ version="1.0.0"
21
+ )
22
+
23
+ # Add CORS middleware
24
+ app.add_middleware(
25
+ CORSMiddleware,
26
+ allow_origins=["*"],
27
+ allow_credentials=True,
28
+ allow_methods=["*"],
29
+ allow_headers=["*"],
30
+ )
31
+
32
+ # Global model storage
33
+ models = {}
34
+ tokenizers = {}
35
+
36
+ # Pydantic models
37
+ class TextRequest(BaseModel):
38
+ text: str = Field(..., description="Text to analyze")
39
+
40
+ class NERRequest(BaseModel):
41
+ text: str = Field(..., description="Legal text for entity extraction")
42
+
43
+ class ClassificationRequest(BaseModel):
44
+ text: str = Field(..., description="Legal document to classify")
45
+
46
+ @app.on_event("startup")
47
+ async def load_models():
48
+ """Load models on startup"""
49
+ logger.info("Starting SobroJuriBert API...")
50
+ logger.info("Models will be loaded on demand to save memory")
51
+
52
+ @app.get("/")
53
+ async def root():
54
+ """Root endpoint with API information"""
55
+ return {
56
+ "name": "SobroJuriBert API",
57
+ "version": "1.0.0",
58
+ "description": "French Legal AI API for lawyers",
59
+ "status": "operational",
60
+ "endpoints": {
61
+ "ner": "/ner - Extract legal entities",
62
+ "classify": "/classify - Classify legal documents",
63
+ "health": "/health - Health check"
64
+ }
65
+ }
66
+
67
+ @app.post("/ner")
68
+ async def extract_entities(request: NERRequest):
69
+ """Extract named entities from French legal text"""
70
+ try:
71
+ # Simple entity extraction
72
+ import re
73
+ entities = []
74
+
75
+ # Extract dates
76
+ dates = re.findall(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', request.text)
77
+ for date in dates:
78
+ entities.append({"text": date, "type": "DATE"})
79
+
80
+ # Extract organizations
81
+ orgs = re.findall(r'(?:SARL|SAS|SA|EURL)\s+[\w\s]+', request.text)
82
+ for org in orgs:
83
+ entities.append({"text": org.strip(), "type": "ORG"})
84
+
85
+ # Extract courts
86
+ courts = re.findall(r'(?:Tribunal|Cour)\s+[\w\s]+?(?=\s|,|\.)', request.text)
87
+ for court in courts:
88
+ entities.append({"text": court.strip(), "type": "COURT"})
89
+
90
+ return {
91
+ "entities": entities,
92
+ "text": request.text,
93
+ "message": "Basic entity extraction (full NER model loading on demand)"
94
+ }
95
+
96
+ except Exception as e:
97
+ logger.error(f"NER error: {e}")
98
+ raise HTTPException(status_code=500, detail=str(e))
99
+
100
+ @app.post("/classify")
101
+ async def classify_document(request: ClassificationRequest):
102
+ """Classify French legal documents"""
103
+ try:
104
+ # Simple keyword-based classification
105
+ text_lower = request.text.lower()
106
+
107
+ categories = {
108
+ "contract": ["contrat", "accord", "convention", "parties"],
109
+ "litigation": ["tribunal", "jugement", "litige", "procès"],
110
+ "corporate": ["société", "sarl", "sas", "entreprise"],
111
+ "employment": ["travail", "salarié", "employeur", "licenciement"]
112
+ }
113
+
114
+ scores = {}
115
+ for category, keywords in categories.items():
116
+ score = sum(1 for kw in keywords if kw in text_lower)
117
+ if score > 0:
118
+ scores[category] = score
119
+
120
+ if not scores:
121
+ primary_category = "general"
122
+ else:
123
+ primary_category = max(scores, key=scores.get)
124
+
125
+ return {
126
+ "primary_category": primary_category,
127
+ "categories": [{"category": cat, "score": score} for cat, score in scores.items()],
128
+ "confidence": 0.8 if scores else 0.5,
129
+ "document_type": "legal_document"
130
+ }
131
+
132
+ except Exception as e:
133
+ logger.error(f"Classification error: {e}")
134
+ raise HTTPException(status_code=500, detail=str(e))
135
+
136
+ @app.get("/health")
137
+ async def health_check():
138
+ """Health check endpoint"""
139
+ return {
140
+ "status": "healthy",
141
+ "timestamp": datetime.utcnow().isoformat(),
142
+ "version": "1.0.0",
143
+ "message": "SobroJuriBert API is running"
144
+ }
145
+
146
+ if __name__ == "__main__":
147
+ import uvicorn
148
+ uvicorn.run(app, host="0.0.0.0", port=7860)
push_to_hf.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ #!/bin/bash
2
+ echo "Pushing SobroJuriBert to Hugging Face..."
3
+ git push -u origin main
4
+ echo "Done! Check: https://huggingface.co/spaces/Sobroinc/SobroJuriBert"