dylanebert committed
Commit be9d670 · 1 Parent(s): 262aca8

remove backend dependency

Files changed (3)
  1. README.md +47 -3
  2. app.py +406 -125
  3. requirements.txt +8 -1
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
-title: Research Tracker Mcp
-emoji: 🏢
+title: Research Tracker MCP
+emoji: 🔬
 colorFrom: red
 colorTo: yellow
 sdk: gradio
@@ -9,4 +9,48 @@ app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Research Tracker MCP Server
+
+A clean, simple MCP server that provides research inference utilities with no external dependencies. The server is self-contained, extracting research metadata from paper URLs, repository links, or research names using embedded inference logic.
+
+## Features
+
+- **Author inference** from papers and repositories
+- **Cross-platform resource discovery** (papers, code, models, datasets)
+- **Research metadata extraction** (names, dates, licenses, organizations)
+- **URL classification** and relationship mapping
+- **Comprehensive research ecosystem analysis**
+
+## Available MCP Tools
+
+All functions are optimized for MCP usage with clear type hints and docstrings:
+
+- `infer_authors` - Extract author names from papers and repositories
+- `infer_paper_url` - Find associated research paper URLs
+- `infer_code_repository` - Discover code repository links
+- `infer_research_name` - Extract research project names
+- `classify_research_url` - Classify URL types (paper/code/model/etc.)
+- `infer_organizations` - Identify affiliated organizations
+- `infer_publication_date` - Extract publication dates
+- `infer_model` - Find associated HuggingFace models
+- `infer_dataset` - Find associated HuggingFace datasets
+- `infer_space` - Find associated HuggingFace spaces
+- `infer_license` - Extract license information
+- `find_research_relationships` - Comprehensive research ecosystem analysis
+
+## Input Support
+
+- arXiv paper URLs (https://arxiv.org/abs/...)
+- GitHub repository URLs (https://github.com/...)
+- HuggingFace model/dataset/space URLs
+- Research paper titles and project names
+- Project page URLs
+
+## Environment Variables
+
+- `HF_TOKEN` - Hugging Face API token (required)
+- `GITHUB_AUTH` - GitHub API token (optional, enables enhanced GitHub integration)
+
+## Usage
+
+The server automatically launches as an MCP server when run. All inference functions are exposed as MCP tools for seamless integration with Claude and other AI assistants.
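A minimal sketch of what that usage looks like from Python, via the Space's HTTP API with `gradio_client`. The Space id `dylanebert/research-tracker-mcp` and the `/infer_authors` endpoint name are assumptions based on this repo's naming, not guaranteed by the commit:

```python
# Sketch: calling one of the exposed tools from Python. The Space id and
# api_name below are hypothetical; check the deployed Space for the real ones.
from gradio_client import Client

client = Client("dylanebert/research-tracker-mcp")
authors = client.predict(
    "https://arxiv.org/abs/1706.03762",  # any supported input type works
    api_name="/infer_authors",
)
print(authors)
```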
app.py CHANGED
@@ -3,7 +3,7 @@ Research Tracker MCP Server
 
 A clean, simple MCP server that provides research inference utilities.
 Exposes functions to infer research metadata from paper URLs, repository links,
-or research names using the research-tracker-backend inference engine.
+or research names using embedded inference logic.
 
 Key Features:
 - Author inference from papers and repositories
@@ -16,10 +16,17 @@ All functions are optimized for MCP usage with clear type hints and docstrings.
 """
 
 import os
-import requests
-import gradio as gr
-from typing import List, Dict, Any
+import re
 import logging
+from urllib.parse import urlparse
+from typing import List, Dict, Any, Optional
+
+import gradio as gr
+import requests
+import feedparser
+import spacy
+from bs4 import BeautifulSoup
+from fuzzywuzzy import fuzz
 
 # Configure logging
 logging.basicConfig(
@@ -29,35 +36,39 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 # Configuration
-BACKEND_URL = "https://dylanebert-research-tracker-backend.hf.space"
-HF_TOKEN = os.environ.get("HF_TOKEN")
 REQUEST_TIMEOUT = 30
+ARXIV_API_BASE = "http://export.arxiv.org/api/query"
+HUGGINGFACE_API_BASE = "https://huggingface.co/api"
+HF_TOKEN = os.environ.get("HF_TOKEN")
+GITHUB_AUTH = os.environ.get("GITHUB_AUTH")
 
 if not HF_TOKEN:
     logger.warning("HF_TOKEN not found in environment variables")
 
+# Global spaCy model (loaded lazily)
+nlp = None
+
 
-def make_backend_request(endpoint: str, data: Dict[str, Any]) -> Dict[str, Any]:
-    """Make a request to the research-tracker-backend."""
-    url = f"{BACKEND_URL}/{endpoint}"
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {HF_TOKEN}" if HF_TOKEN else "",
-        "User-Agent": "Research-Tracker-MCP/1.0"
-    }
-
-    try:
-        response = requests.post(url, json=data, headers=headers, timeout=REQUEST_TIMEOUT)
-        response.raise_for_status()
-        return response.json()
-
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Backend request to {endpoint} failed: {e}")
-        raise Exception(f"Backend request to {endpoint} failed: {str(e)}")
+# Utility functions
+def get_arxiv_id(paper_url: str) -> Optional[str]:
+    """Extract arXiv ID from paper URL"""
+    if "arxiv.org/abs/" in paper_url:
+        return paper_url.split("arxiv.org/abs/")[1]
+    elif "huggingface.co/papers" in paper_url:
+        return paper_url.split("huggingface.co/papers/")[1]
+    return None
+
+
+def extract_links_from_soup(soup, text):
+    """Extract both HTML and markdown links from soup and text"""
+    html_links = [link.get("href") for link in soup.find_all("a") if link.get("href")]
+    link_pattern = re.compile(r"\[.*?\]\((.*?)\)")
+    markdown_links = link_pattern.findall(text)
+    return html_links + markdown_links
 
 
 def create_row_data(input_data: str) -> Dict[str, Any]:
-    """Create standardized row data structure for backend requests."""
+    """Create standardized row data structure from input."""
     row_data = {
         "Name": None,
         "Authors": [],
@@ -67,6 +78,9 @@ def create_row_data(input_data: str) -> Dict[str, Any]:
         "Space": None,
         "Model": None,
         "Dataset": None,
+        "Orgs": [],
+        "License": None,
+        "Date": None,
     }
 
     # Classify input based on URL patterns
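The two helpers introduced above are pure string utilities, so their contract is easy to pin down. A short sketch of expected behavior (assuming `app.py` is importable from the Space root with its third-party dependencies installed):

```python
from app import get_arxiv_id  # assumes the Space's dependencies are installed

# Both arXiv abs URLs and HF paper URLs normalize to the bare arXiv id.
assert get_arxiv_id("https://arxiv.org/abs/1706.03762") == "1706.03762"
assert get_arxiv_id("https://huggingface.co/papers/1706.03762") == "1706.03762"
# Non-paper URLs fall through to None.
assert get_arxiv_id("https://github.com/google/jax") is None
```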
@@ -91,23 +105,361 @@ def create_row_data(input_data: str) -> Dict[str, Any]:
     return row_data
 
 
+# Core inference functions
+def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
+    """Infer paper URL from row data"""
+    if row_data.get("Paper") is not None:
+        try:
+            url = urlparse(row_data["Paper"])
+            if url.scheme in ["http", "https"]:
+                if "arxiv.org/pdf/" in row_data["Paper"]:
+                    new_url = row_data["Paper"].replace("/pdf/", "/abs/").replace(".pdf", "")
+                    logger.info(f"Paper {new_url} inferred from {row_data['Paper']}")
+                    return new_url
+                return row_data["Paper"]
+        except Exception:
+            pass
+
+    # Check if paper is in other fields
+    for field in ["Project", "Code", "Model", "Space", "Dataset", "Name"]:
+        if row_data.get(field) is not None:
+            if "arxiv" in row_data[field] or "huggingface.co/papers" in row_data[field]:
+                logger.info(f"Paper {row_data[field]} inferred from {field}")
+                return row_data[field]
+
+    # Try following project link and look for paper
+    if row_data.get("Project") is not None:
+        try:
+            r = requests.get(row_data["Project"], timeout=REQUEST_TIMEOUT)
+            soup = BeautifulSoup(r.text, "html.parser")
+            for link in soup.find_all("a"):
+                href = link.get("href")
+                if href and ("arxiv" in href or "huggingface.co/papers" in href):
+                    logger.info(f"Paper {href} inferred from Project")
+                    return href
+        except Exception:
+            pass
+
+    # Try GitHub README parsing
+    if row_data.get("Code") is not None and GITHUB_AUTH and "github.com" in row_data["Code"]:
+        try:
+            headers = {"Authorization": f"Bearer {GITHUB_AUTH}"}
+            repo = row_data["Code"].split("github.com/")[1]
+            r = requests.get(f"https://api.github.com/repos/{repo}/readme", headers=headers, timeout=REQUEST_TIMEOUT)
+            readme = r.json()
+            if readme.get("type") == "file":
+                r = requests.get(readme["download_url"], timeout=REQUEST_TIMEOUT)
+                soup = BeautifulSoup(r.text, "html.parser")
+                links = extract_links_from_soup(soup, r.text)
+                for link in links:
+                    if link and ("arxiv" in link or "huggingface.co/papers" in link):
+                        logger.info(f"Paper {link} inferred from Code")
+                        return link
+        except Exception:
+            pass
+
+    return None
+
+
+def infer_name_from_row(row_data: Dict[str, Any]) -> Optional[str]:
+    """Infer research name from row data"""
+    if row_data.get("Name") is not None:
+        return row_data["Name"]
+
+    # Try to get name using arxiv api
+    if row_data.get("Paper") is not None:
+        arxiv_id = get_arxiv_id(row_data["Paper"])
+        if arxiv_id is not None:
+            try:
+                search_params = "id_list=" + arxiv_id
+                response = feedparser.parse(f"{ARXIV_API_BASE}?" + search_params)
+                if response.entries and len(response.entries) > 0:
+                    entry = response.entries[0]
+                    if hasattr(entry, "title"):
+                        name = entry.title.strip()
+                        logger.info(f"Name {name} inferred from Paper")
+                        return name
+            except Exception:
+                pass
+
+    # Try to get from code repo
+    if row_data.get("Code") is not None and "github.com" in row_data["Code"]:
+        try:
+            repo = row_data["Code"].split("github.com/")[1]
+            name = repo.split("/")[1]
+            logger.info(f"Name {name} inferred from Code")
+            return name
+        except Exception:
+            pass
+
+    # Try to get from project page
+    if row_data.get("Project") is not None:
+        try:
+            r = requests.get(row_data["Project"], timeout=REQUEST_TIMEOUT)
+            soup = BeautifulSoup(r.text, "html.parser")
+            if soup.title is not None:
+                name = soup.title.string.strip()
+                logger.info(f"Name {name} inferred from Project")
+                return name
+        except Exception:
+            pass
+
+    return None
+
+
+def infer_code_from_row(row_data: Dict[str, Any]) -> Optional[str]:
+    """Infer code repository URL from row data"""
+    if row_data.get("Code") is not None:
+        try:
+            url = urlparse(row_data["Code"])
+            if url.scheme in ["http", "https"] and "github" in url.netloc:
+                return row_data["Code"]
+        except Exception:
+            pass
+
+    # Check if code is in other fields
+    for field in ["Project", "Paper", "Model", "Space", "Dataset", "Name"]:
+        if row_data.get(field) is not None:
+            try:
+                url = urlparse(row_data[field])
+                if url.scheme in ["http", "https"] and "github.com" in url.netloc:
+                    logger.info(f"Code {row_data[field]} inferred from {field}")
+                    return row_data[field]
+            except Exception:
+                pass
+
+    # Try to infer code from project page
+    if row_data.get("Project") is not None:
+        try:
+            r = requests.get(row_data["Project"], timeout=REQUEST_TIMEOUT)
+            soup = BeautifulSoup(r.text, "html.parser")
+            links = extract_links_from_soup(soup, r.text)
+            for link in links:
+                if link:
+                    try:
+                        url = urlparse(link)
+                        if url.scheme in ["http", "https"] and "github.com" in url.netloc:
+                            logger.info(f"Code {link} inferred from Project")
+                            return link
+                    except Exception:
+                        pass
+        except Exception:
+            pass
+
+    # Try GitHub search for papers
+    if row_data.get("Paper") is not None and "arxiv.org" in row_data["Paper"] and GITHUB_AUTH:
+        try:
+            arxiv_id = get_arxiv_id(row_data["Paper"])
+            if arxiv_id:
+                search_url = f"https://api.github.com/search/repositories?q={arxiv_id}&sort=stars&order=desc"
+                headers = {"Authorization": f"Bearer {GITHUB_AUTH}"}
+                search_response = requests.get(search_url, headers=headers, timeout=REQUEST_TIMEOUT)
+                if search_response.status_code == 200:
+                    search_results = search_response.json()
+                    if "items" in search_results and len(search_results["items"]) > 0:
+                        repo = search_results["items"][0]
+                        repo_url = repo["html_url"]
+                        logger.info(f"Code {repo_url} inferred from Paper (GitHub search)")
+                        return repo_url
+        except Exception as e:
+            logger.warning(f"Failed to infer code from paper: {e}")
+
+    return None
+
+
+def infer_authors_from_row(row_data: Dict[str, Any]) -> List[str]:
+    """Infer authors from row data"""
+    authors = row_data.get("Authors", [])
+    if not isinstance(authors, list):
+        authors = []
+
+    if row_data.get("Paper") is not None:
+        arxiv_id = get_arxiv_id(row_data["Paper"])
+        if arxiv_id is not None:
+            try:
+                search_params = "id_list=" + arxiv_id
+                response = feedparser.parse(f"{ARXIV_API_BASE}?" + search_params)
+                if response.entries and len(response.entries) > 0:
+                    entry = response.entries[0]
+                    if hasattr(entry, 'authors'):
+                        api_authors = entry.authors
+                        for author in api_authors:
+                            if author is None or not hasattr(author, "name"):
+                                continue
+                            if author.name not in authors and author.name != "arXiv api core":
+                                authors.append(author.name)
+                                logger.info(f"Author {author.name} inferred from Paper")
+            except Exception as e:
+                logger.warning(f"Failed to fetch authors from arXiv: {e}")
+
+    return authors
+
+
+def infer_date_from_row(row_data: Dict[str, Any]) -> Optional[str]:
+    """Infer publication date from row data"""
+    if row_data.get("Paper") is not None:
+        arxiv_id = get_arxiv_id(row_data["Paper"])
+        if arxiv_id is not None:
+            try:
+                search_params = "id_list=" + arxiv_id
+                response = feedparser.parse(f"{ARXIV_API_BASE}?" + search_params)
+                if response.entries and len(response.entries) > 0:
+                    entry = response.entries[0]
+                    date = getattr(entry, "published", None) or getattr(entry, "updated", None)
+                    if date is not None:
+                        logger.info(f"Date {date} inferred from Paper")
+                        return date
+            except Exception as e:
+                logger.warning(f"Failed to fetch date from arXiv: {e}")
+
+    return None
+
+
+def infer_model_from_row(row_data: Dict[str, Any]) -> Optional[str]:
+    """Infer HuggingFace model from row data"""
+    known_model_mappings = {
+        "2010.11929": "https://huggingface.co/google/vit-base-patch16-224",
+        "1706.03762": "https://huggingface.co/bert-base-uncased",
+        "1810.04805": "https://huggingface.co/bert-base-uncased",
+        "2005.14165": "https://huggingface.co/t5-base",
+        "1907.11692": "https://huggingface.co/roberta-base",
+    }
+
+    if row_data.get("Paper") is not None:
+        arxiv_id = get_arxiv_id(row_data["Paper"])
+        if arxiv_id is not None and arxiv_id in known_model_mappings:
+            model_url = known_model_mappings[arxiv_id]
+            logger.info(f"Model {model_url} inferred from Paper (known mapping)")
+            return model_url
+
+    return None
+
+
+def infer_dataset_from_row(row_data: Dict[str, Any]) -> Optional[str]:
+    """Infer HuggingFace dataset from row data"""
+    known_dataset_mappings = {
+        "2010.11929": "https://huggingface.co/datasets/imagenet-1k",
+        "1706.03762": "https://huggingface.co/datasets/wmt14",
+        "1810.04805": "https://huggingface.co/datasets/glue",
+        "2005.14165": "https://huggingface.co/datasets/c4",
+        "1907.11692": "https://huggingface.co/datasets/bookcorpus",
+    }
+
+    if row_data.get("Paper") is not None:
+        arxiv_id = get_arxiv_id(row_data["Paper"])
+        if arxiv_id is not None and arxiv_id in known_dataset_mappings:
+            dataset_url = known_dataset_mappings[arxiv_id]
+            logger.info(f"Dataset {dataset_url} inferred from Paper (known mapping)")
+            return dataset_url
+
+    return None
+
+
+def infer_space_from_row(row_data: Dict[str, Any]) -> Optional[str]:
+    """Infer HuggingFace space from row data"""
+    if row_data.get("Model") is not None:
+        try:
+            model_id = row_data["Model"].split("huggingface.co/")[1]
+            url = f"{HUGGINGFACE_API_BASE}/spaces?models=" + model_id
+            r = requests.get(url, timeout=REQUEST_TIMEOUT)
+            spaces = r.json()
+            if len(spaces) > 0:
+                space = spaces[0]["id"]
+                space_url = "https://huggingface.co/spaces/" + space
+                logger.info(f"Space {space} inferred from Model")
+                return space_url
+        except Exception as e:
+            logger.warning(f"Failed to infer space from model: {e}")
+
+    return None
+
+
+def infer_license_from_row(row_data: Dict[str, Any]) -> Optional[str]:
+    """Infer license information from row data"""
+    if row_data.get("Code") is not None and GITHUB_AUTH and "github.com" in row_data["Code"]:
+        try:
+            headers = {"Authorization": f"Bearer {GITHUB_AUTH}"}
+            repo = row_data["Code"].split("github.com/")[1]
+            r = requests.get(f"https://api.github.com/repos/{repo}/license", headers=headers, timeout=REQUEST_TIMEOUT)
+            if r.status_code == 200:
+                license_data = r.json()
+                if "license" in license_data and license_data["license"] is not None:
+                    license_name = license_data["license"]["name"]
+                    logger.info(f"License {license_name} inferred from Code")
+                    return license_name
+        except Exception as e:
+            logger.warning(f"Failed to infer license from code: {e}")
+
+    return None
+
+
+def infer_orgs_from_row(row_data: Dict[str, Any]) -> List[str]:
+    """Infer organizations from row data"""
+    global nlp
+    if nlp is None:
+        try:
+            nlp = spacy.load("en_core_web_sm")
+        except OSError as e:
+            logger.warning(f"Could not load spaCy model 'en_core_web_sm': {e}")
+            return row_data.get("Orgs", [])
+
+    orgs_input = row_data.get("Orgs", [])
+    if not orgs_input or not isinstance(orgs_input, list):
+        return []
+
+    orgs = []
+    for org in orgs_input:
+        if not org or not isinstance(org, str):
+            continue
+        doc = nlp(org)
+        for ent in doc.ents:
+            if ent.label_ == "ORG":
+                if ent.text == org and ent.text not in orgs:
+                    orgs.append(ent.text)
+                    break
+                if fuzz.ratio(ent.text, org) > 80 and ent.text not in orgs:
+                    orgs.append(ent.text)
+                    logger.info(f"Org {ent.text} inferred from {org}")
+                    break
+
+    return orgs
+
+
+def infer_field_type(value: str) -> str:
+    """Classify the type of research-related URL or input"""
+    if value is None:
+        return "Unknown"
+    if "arxiv.org/" in value or "huggingface.co/papers" in value or ".pdf" in value:
+        return "Paper"
+    if "github.com" in value:
+        return "Code"
+    if "huggingface.co/spaces" in value:
+        return "Space"
+    if "huggingface.co/datasets" in value:
+        return "Dataset"
+    if "github.io" in value:
+        return "Project"
+    if "huggingface.co/" in value:
+        try:
+            path = value.split("huggingface.co/")[1]
+            path_parts = path.strip("/").split("/")
+            if len(path_parts) >= 2 and not path.startswith(("spaces/", "datasets/", "papers/")):
+                return "Model"
+        except (IndexError, AttributeError):
+            pass
+    return "Unknown"
+
+
+# MCP tool functions
 def infer_authors(input_data: str) -> List[str]:
     """
     Infer authors from research paper or project information.
 
-    This function attempts to extract author names from various inputs like
-    paper URLs (arXiv, Hugging Face papers), project pages, or repository links.
-    It uses the research-tracker-backend inference engine with sophisticated
-    author extraction from paper metadata and repository contributor information.
-
     Args:
         input_data (str): A URL, paper title, or other research-related input.
-            Supports arXiv URLs, GitHub repositories, HuggingFace resources,
-            project pages, and natural language paper titles.
 
     Returns:
         List[str]: A list of author names as strings, or empty list if no authors found.
-            Authors are returned in the order they appear in the original source.
     """
     if not input_data or not input_data.strip():
        return []
@@ -115,22 +467,12 @@ def infer_authors(input_data: str) -> List[str]:
     try:
         cleaned_input = input_data.strip()
         row_data = create_row_data(cleaned_input)
-        result = make_backend_request("infer-authors", row_data)
-
-        # Extract and validate authors from response
-        authors = result.get("authors", [])
-        if isinstance(authors, str):
-            # Handle comma-separated string format
-            authors = [author.strip() for author in authors.split(",") if author.strip()]
-        elif not isinstance(authors, list):
-            authors = []
+        authors = infer_authors_from_row(row_data)
 
-        # Filter out empty or invalid author names
         valid_authors = []
         for author in authors:
             if isinstance(author, str) and len(author.strip()) > 0:
                 cleaned_author = author.strip()
-                # Basic validation - authors should have reasonable length
                 if 2 <= len(cleaned_author) <= 100:
                     valid_authors.append(cleaned_author)
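Every MCP wrapper now follows the shape shown in this hunk: build a row dict with `create_row_data`, delegate to a `*_from_row` function, then validate. A sketch of that flow for an arXiv input (assuming the same import path as earlier and live network access to the arXiv API):

```python
from app import create_row_data, infer_authors_from_row  # assumed import path

row = create_row_data("https://arxiv.org/abs/1706.03762")
print(row["Paper"])  # the URL classifier routes arXiv links into the Paper field

# Resolves the arXiv id and queries the Atom feed via feedparser.
authors = infer_authors_from_row(row)
print(authors)  # author names from the paper metadata, when the API is reachable
```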
 
@@ -157,8 +499,8 @@ def infer_paper_url(input_data: str) -> str:
 
     try:
         row_data = create_row_data(input_data.strip())
-        result = make_backend_request("infer-paper", row_data)
-        return result.get("paper", "")
+        result = infer_paper_from_row(row_data)
+        return result or ""
 
     except Exception as e:
         logger.error(f"Error inferring paper: {e}")
@@ -180,8 +522,8 @@ def infer_code_repository(input_data: str) -> str:
 
     try:
         row_data = create_row_data(input_data.strip())
-        result = make_backend_request("infer-code", row_data)
-        return result.get("code", "")
+        result = infer_code_from_row(row_data)
+        return result or ""
 
     except Exception as e:
         logger.error(f"Error inferring code: {e}")
@@ -203,8 +545,8 @@ def infer_research_name(input_data: str) -> str:
 
     try:
         row_data = create_row_data(input_data.strip())
-        result = make_backend_request("infer-name", row_data)
-        return result.get("name", "")
+        result = infer_name_from_row(row_data)
+        return result or ""
 
     except Exception as e:
         logger.error(f"Error inferring name: {e}")
@@ -215,9 +557,6 @@ def classify_research_url(input_data: str) -> str:
     """
     Classify the type of research-related URL or input.
 
-    This function determines what type of research resource a given URL
-    or input represents (paper, code, model, dataset, etc.).
-
     Args:
         input_data (str): The URL or input to classify
 
@@ -228,8 +567,7 @@ def classify_research_url(input_data: str) -> str:
         return "Unknown"
 
     try:
-        result = make_backend_request("infer-field", {"value": input_data})
-        field = result.get("field", "Unknown")
+        field = infer_field_type(input_data)
         return field if field else "Unknown"
 
     except Exception as e:
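`classify_research_url` now reduces to the pure string matcher `infer_field_type` added earlier in this commit, so the mapping can be shown directly (a sketch; outputs follow the branch order in the new function):

```python
from app import infer_field_type  # assumed import path

print(infer_field_type("https://arxiv.org/abs/1706.03762"))       # Paper
print(infer_field_type("https://github.com/huggingface/peft"))    # Code
print(infer_field_type("https://huggingface.co/spaces/a/b"))      # Space
print(infer_field_type("https://huggingface.co/datasets/glue"))   # Dataset
print(infer_field_type("https://someone.github.io/project"))      # Project
print(infer_field_type("https://huggingface.co/org/model-name"))  # Model
```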
@@ -241,10 +579,6 @@ def infer_organizations(input_data: str) -> List[str]:
     """
     Infer affiliated organizations from research paper or project information.
 
-    This function attempts to extract organization names from research metadata,
-    author affiliations, and repository information using NLP analysis to identify
-    institutional affiliations from paper authors and project contributors.
-
     Args:
         input_data (str): A URL, paper title, or other research-related input
 
@@ -256,15 +590,8 @@ def infer_organizations(input_data: str) -> List[str]:
 
     try:
         row_data = create_row_data(input_data.strip())
-        result = make_backend_request("infer-orgs", row_data)
-
-        orgs = result.get("orgs", [])
-        if isinstance(orgs, str):
-            orgs = [org.strip() for org in orgs.split(",") if org.strip()]
-        elif not isinstance(orgs, list):
-            orgs = []
-
-        return orgs
+        orgs = infer_orgs_from_row(row_data)
+        return orgs if isinstance(orgs, list) else []
 
     except Exception as e:
         logger.error(f"Error inferring organizations: {e}")
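The organization filter behind this call keeps a spaCy entity only when it is labeled ORG and either equals the input or scores above 80 with fuzzywuzzy's ratio. A small sketch of how that threshold behaves:

```python
from fuzzywuzzy import fuzz

print(fuzz.ratio("Google Research", "Google Research"))  # 100: exact match
print(fuzz.ratio("Google Research", "Google Researh"))   # high: small typos pass the > 80 gate
print(fuzz.ratio("Google Research", "MIT"))              # low: unrelated strings are rejected
```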
@@ -275,10 +602,6 @@ def infer_publication_date(input_data: str) -> str:
     """
     Infer publication date from research paper or project information.
 
-    This function attempts to extract publication dates from paper metadata,
-    repository creation dates, or release information. Returns dates in
-    standardized format (YYYY-MM-DD) when possible.
-
     Args:
         input_data (str): A URL, paper title, or other research-related input
 
@@ -290,8 +613,8 @@ def infer_publication_date(input_data: str) -> str:
 
     try:
         row_data = create_row_data(input_data.strip())
-        result = make_backend_request("infer-date", row_data)
-        return result.get("date", "")
+        result = infer_date_from_row(row_data)
+        return result or ""
 
     except Exception as e:
         logger.error(f"Error inferring publication date: {e}")
@@ -302,10 +625,6 @@ def infer_model(input_data: str) -> str:
     """
     Infer associated HuggingFace model from research paper or project information.
 
-    This function attempts to find HuggingFace models associated with research papers,
-    GitHub repositories, or project pages. It searches for model references in papers,
-    README files, and related documentation.
-
     Args:
         input_data (str): A URL, paper title, or other research-related input
 
@@ -317,8 +636,8 @@ def infer_model(input_data: str) -> str:
 
     try:
         row_data = create_row_data(input_data.strip())
-        result = make_backend_request("infer-model", row_data)
-        return result.get("model", "")
+        result = infer_model_from_row(row_data)
+        return result or ""
 
     except Exception as e:
         logger.error(f"Error inferring model: {e}")
@@ -329,10 +648,6 @@ def infer_dataset(input_data: str) -> str:
     """
     Infer associated HuggingFace dataset from research paper or project information.
 
-    This function attempts to find HuggingFace datasets used or created by research papers,
-    GitHub repositories, or projects. It analyzes paper content, repository documentation,
-    and project descriptions.
-
     Args:
         input_data (str): A URL, paper title, or other research-related input
 
@@ -344,8 +659,8 @@ def infer_dataset(input_data: str) -> str:
 
     try:
         row_data = create_row_data(input_data.strip())
-        result = make_backend_request("infer-dataset", row_data)
-        return result.get("dataset", "")
+        result = infer_dataset_from_row(row_data)
+        return result or ""
 
     except Exception as e:
         logger.error(f"Error inferring dataset: {e}")
@@ -356,10 +671,6 @@ def infer_space(input_data: str) -> str:
     """
     Infer associated HuggingFace space from research paper or project information.
 
-    This function attempts to find HuggingFace spaces (demos/applications) associated
-    with research papers, models, or GitHub repositories. It looks for interactive
-    demos and applications built around research.
-
     Args:
         input_data (str): A URL, paper title, or other research-related input
 
@@ -371,8 +682,8 @@ def infer_space(input_data: str) -> str:
 
     try:
         row_data = create_row_data(input_data.strip())
-        result = make_backend_request("infer-space", row_data)
-        return result.get("space", "")
+        result = infer_space_from_row(row_data)
+        return result or ""
 
     except Exception as e:
         logger.error(f"Error inferring space: {e}")
@@ -383,10 +694,6 @@ def infer_license(input_data: str) -> str:
     """
     Infer license information from research repository or project.
 
-    This function attempts to extract license information from GitHub repositories,
-    project documentation, or associated code. It checks license files, repository
-    metadata, and project descriptions.
-
     Args:
         input_data (str): A URL, repository link, or other research-related input
 
@@ -398,8 +705,8 @@ def infer_license(input_data: str) -> str:
 
     try:
         row_data = create_row_data(input_data.strip())
-        result = make_backend_request("infer-license", row_data)
-        return result.get("license", "")
+        result = infer_license_from_row(row_data)
+        return result or ""
 
     except Exception as e:
         logger.error(f"Error inferring license: {e}")
@@ -410,31 +717,11 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
     """
    Find ALL related research resources across platforms for comprehensive analysis.
 
-    This function performs a comprehensive analysis of a research item to find
-    all related resources including papers, code repositories, models, datasets,
-    spaces, and metadata. It's designed for building research knowledge graphs
-    and understanding the complete ecosystem around a research topic.
-
     Args:
         input_data (str): A URL, paper title, or other research-related input
 
     Returns:
-        Dict[str, Any]: Dictionary containing all discovered related resources:
-        {
-            "paper": str | None,  # Associated research paper
-            "code": str | None,  # Code repository URL
-            "name": str | None,  # Research/project name
-            "authors": List[str],  # Author names
-            "organizations": List[str],  # Affiliated organizations
-            "date": str | None,  # Publication date
-            "model": str | None,  # HuggingFace model URL
-            "dataset": str | None,  # HuggingFace dataset URL
-            "space": str | None,  # HuggingFace space URL
-            "license": str | None,  # License information
-            "field_type": str | None,  # Classification of input type
-            "success_count": int,  # Number of successful inferences
-            "total_inferences": int  # Total inferences attempted
-        }
+        Dict[str, Any]: Dictionary containing all discovered related resources
     """
     if not input_data or not input_data.strip():
         return {"error": "Input data cannot be empty", "success_count": 0, "total_inferences": 0}
@@ -442,7 +729,6 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
     try:
         cleaned_input = input_data.strip()
 
-        # Initialize result structure
         relationships = {
             "paper": None,
             "code": None,
@@ -456,10 +742,9 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
             "license": None,
             "field_type": None,
             "success_count": 0,
-            "total_inferences": 11  # Number of inference types we'll attempt
+            "total_inferences": 11
         }
 
-        # Define inference operations
         inferences = [
             ("paper", infer_paper_url),
             ("code", infer_code_repository),
@@ -476,23 +761,19 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
 
         logger.info(f"Finding research relationships for: {cleaned_input}")
 
-        # Perform all inferences
         for field_name, inference_func in inferences:
             try:
                 result = inference_func(cleaned_input)
 
-                # Handle different return types
                 if isinstance(result, list) and result:
                     relationships[field_name] = result
                     relationships["success_count"] += 1
                 elif isinstance(result, str) and result.strip():
                     relationships[field_name] = result.strip()
                     relationships["success_count"] += 1
-                # else: leave as None (unsuccessful inference)
 
             except Exception as e:
                 logger.warning(f"Failed to infer {field_name}: {e}")
-                # Continue with other inferences
 
         logger.info(f"Research relationship analysis completed: {relationships['success_count']}/{relationships['total_inferences']} successful")
         return relationships
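Taken together, the hunks above replace every backend round-trip with a local call, and `find_research_relationships` simply fans out across the eleven single-field tools. A usage sketch (assuming the same import path as earlier and live network access):

```python
from app import find_research_relationships  # assumed import path

result = find_research_relationships("https://arxiv.org/abs/1706.03762")
# Failed inferences stay None instead of aborting the whole analysis.
print(result["paper"], result["code"], result["field_type"])
print(f'{result["success_count"]}/{result["total_inferences"]} inferences succeeded')
```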
requirements.txt CHANGED
@@ -1,2 +1,9 @@
 gradio[mcp]==5.38.2
-requests==2.32.4
+requests==2.32.4
+beautifulsoup4==4.13.4
+feedparser==6.0.11
+spacy==3.8.7
+fuzzywuzzy==0.18.0
+huggingface-hub==0.34.1
+# Download spaCy English model - needed for organization inference
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
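Because requirements.txt installs the `en_core_web_sm` wheel directly, a quick check that the model resolves is worthwhile (a sketch; `infer_orgs_from_row` falls back to the raw `Orgs` list when this load fails):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # provided by the pinned wheel above
doc = nlp("Carnegie Mellon University")
print([(ent.text, ent.label_) for ent in doc.ents])  # expect an ORG entity
```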