dylanebert committed
Commit 262aca8
Parent(s): 057e151
Files changed (1):
  1. app.py +128 -566
app.py CHANGED
@@ -1,18 +1,31 @@
 """
 Research Tracker MCP Server
 
-A Gradio-based MCP server that provides research inference utilities.
-Delegates inference logic to the research-tracker-backend for consistency.
+A clean, simple MCP server that provides research inference utilities.
+Exposes functions to infer research metadata from paper URLs, repository links,
+or research names using the research-tracker-backend inference engine.
+
+Key Features:
+- Author inference from papers and repositories
+- Cross-platform resource discovery (papers, code, models, datasets)
+- Research metadata extraction (names, dates, licenses, organizations)
+- URL classification and relationship mapping
+- Comprehensive research ecosystem analysis
+
+All functions are optimized for MCP usage with clear type hints and docstrings.
 """
 
 import os
 import requests
 import gradio as gr
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any
 import logging
 
 # Configure logging
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
 logger = logging.getLogger(__name__)
 
 # Configuration
@@ -24,102 +37,27 @@ if not HF_TOKEN:
     logger.warning("HF_TOKEN not found in environment variables")
 
 
-def validate_input(input_data: str, input_name: str = "input") -> str:
-    """
-    Validate and sanitize input data.
-
-    Args:
-        input_data: The input string to validate
-        input_name: Name of the input for error messages
-
-    Returns:
-        Cleaned input string
-
-    Raises:
-        ValueError: If input is invalid
-    """
-    if not input_data:
-        raise ValueError(f"{input_name} cannot be empty or None")
-
-    cleaned = input_data.strip()
-    if not cleaned:
-        raise ValueError(f"{input_name} cannot be empty after trimming")
-
-    # Basic URL validation if it looks like a URL
-    if cleaned.startswith(("http://", "https://")):
-        if len(cleaned) > 2000:
-            raise ValueError(f"{input_name} URL is too long (max 2000 characters)")
-        # Check for suspicious patterns
-        suspicious_patterns = ["javascript:", "data:", "file:", "ftp:"]
-        if any(pattern in cleaned.lower() for pattern in suspicious_patterns):
-            raise ValueError(f"{input_name} contains invalid URL scheme")
-
-    return cleaned
-
-
 def make_backend_request(endpoint: str, data: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    Make a request to the research-tracker-backend with comprehensive error handling.
-
-    Args:
-        endpoint: The backend endpoint to call (e.g., 'infer-authors')
-        data: The data to send in the request body
-
-    Returns:
-        The response data from the backend
-
-    Raises:
-        Exception: If the request fails or returns an error
-    """
-    if not HF_TOKEN:
-        logger.warning("HF_TOKEN not available - backend requests may fail")
-
+    """Make a request to the research-tracker-backend."""
     url = f"{BACKEND_URL}/{endpoint}"
     headers = {
         "Content-Type": "application/json",
-        "Authorization": f"Bearer {HF_TOKEN}" if HF_TOKEN else ""
+        "Authorization": f"Bearer {HF_TOKEN}" if HF_TOKEN else "",
+        "User-Agent": "Research-Tracker-MCP/1.0"
     }
 
     try:
-        logger.debug(f"Making request to {endpoint} with data: {data}")
         response = requests.post(url, json=data, headers=headers, timeout=REQUEST_TIMEOUT)
-
-        if response.status_code == 401:
-            raise Exception("Authentication failed - please check HF_TOKEN")
-        elif response.status_code == 403:
-            raise Exception("Access forbidden - insufficient permissions")
-        elif response.status_code == 404:
-            raise Exception(f"Backend endpoint {endpoint} not found")
-        elif response.status_code == 422:
-            raise Exception("Invalid request data format")
-        elif response.status_code >= 500:
-            raise Exception(f"Backend server error (status {response.status_code})")
-
         response.raise_for_status()
-        result = response.json()
-        logger.debug(f"Backend response: {result}")
-        return result
-
-    except requests.exceptions.Timeout:
-        raise Exception(f"Backend request to {endpoint} timed out after {REQUEST_TIMEOUT}s")
-    except requests.exceptions.ConnectionError:
-        raise Exception(f"Failed to connect to backend - service may be unavailable")
+        return response.json()
+
     except requests.exceptions.RequestException as e:
+        logger.error(f"Backend request to {endpoint} failed: {e}")
        raise Exception(f"Backend request to {endpoint} failed: {str(e)}")
-    except ValueError as e:
-        raise Exception(f"Invalid JSON response from backend: {str(e)}")
 
 
 def create_row_data(input_data: str) -> Dict[str, Any]:
-    """
-    Create standardized row data structure for backend requests.
-
-    Args:
-        input_data: The input string to analyze
-
-    Returns:
-        Dictionary with appropriate field populated
-    """
+    """Create standardized row data structure for backend requests."""
     row_data = {
         "Name": None,
         "Authors": [],
@@ -163,35 +101,20 @@ def infer_authors(input_data: str) -> List[str]:
     author extraction from paper metadata and repository contributor information.
 
     Args:
-        input_data: A URL, paper title, or other research-related input.
-            Supports arXiv URLs, GitHub repositories, HuggingFace resources,
-            project pages, and natural language paper titles.
+        input_data (str): A URL, paper title, or other research-related input.
+            Supports arXiv URLs, GitHub repositories, HuggingFace resources,
+            project pages, and natural language paper titles.
 
     Returns:
-        A list of author names as strings, or empty list if no authors found.
-        Authors are returned in the order they appear in the original source.
-
-    Examples:
-        >>> infer_authors("https://arxiv.org/abs/2010.11929")
-        ["Alexey Dosovitskiy", "Lucas Beyer", "Alexander Kolesnikov", ...]
-
-        >>> infer_authors("https://github.com/google-research/vision_transformer")
-        ["Alexey Dosovitskiy", "Lucas Beyer", ...]
-
-        >>> infer_authors("Vision Transformer")
-        ["Alexey Dosovitskiy", "Lucas Beyer", ...]
-
-    Raises:
-        No exceptions are raised - errors are logged and empty list returned.
+        List[str]: A list of author names as strings, or empty list if no authors found.
+            Authors are returned in the order they appear in the original source.
     """
+    if not input_data or not input_data.strip():
+        return []
+
     try:
-        # Validate and clean input
-        cleaned_input = validate_input(input_data, "input_data")
-
-        # Create structured data for backend
+        cleaned_input = input_data.strip()
         row_data = create_row_data(cleaned_input)
-
-        # Call the backend
         result = make_backend_request("infer-authors", row_data)
 
         # Extract and validate authors from response
@@ -200,7 +123,6 @@ def infer_authors(input_data: str) -> List[str]:
             # Handle comma-separated string format
             authors = [author.strip() for author in authors.split(",") if author.strip()]
         elif not isinstance(authors, list):
-            logger.warning(f"Unexpected authors format: {type(authors)}")
             authors = []
 
         # Filter out empty or invalid author names
@@ -215,9 +137,6 @@ def infer_authors(input_data: str) -> List[str]:
         logger.info(f"Successfully inferred {len(valid_authors)} authors from input")
         return valid_authors
 
-    except ValueError as e:
-        logger.error(f"Input validation error: {e}")
-        return []
     except Exception as e:
         logger.error(f"Error inferring authors: {e}")
         return []
@@ -228,10 +147,10 @@ def infer_paper_url(input_data: str) -> str:
     Infer the paper URL from various research-related inputs.
 
     Args:
-        input_data: A URL, repository link, or other research-related input
+        input_data (str): A URL, repository link, or other research-related input
 
     Returns:
-        The paper URL (typically arXiv or Hugging Face papers), or empty string if not found
+        str: The paper URL (typically arXiv or Hugging Face papers), or empty string if not found
     """
     if not input_data or not input_data.strip():
         return ""
@@ -251,10 +170,10 @@ def infer_code_repository(input_data: str) -> str:
     Infer the code repository URL from research-related inputs.
 
     Args:
-        input_data: A URL, paper link, or other research-related input
+        input_data (str): A URL, paper link, or other research-related input
 
     Returns:
-        The code repository URL (typically GitHub), or empty string if not found
+        str: The code repository URL (typically GitHub), or empty string if not found
     """
     if not input_data or not input_data.strip():
         return ""
@@ -274,10 +193,10 @@ def infer_research_name(input_data: str) -> str:
     Infer the research paper or project name from various inputs.
 
     Args:
-        input_data: A URL, repository link, or other research-related input
+        input_data (str): A URL, repository link, or other research-related input
 
     Returns:
-        The research name/title, or empty string if not found
+        str: The research name/title, or empty string if not found
     """
     if not input_data or not input_data.strip():
         return ""
@@ -292,7 +211,7 @@ def infer_research_name(input_data: str) -> str:
     return ""
 
 
-def classify_research_url(url: str) -> str:
+def classify_research_url(input_data: str) -> str:
     """
     Classify the type of research-related URL or input.
 
@@ -300,29 +219,16 @@ def classify_research_url(url: str) -> str:
     or input represents (paper, code, model, dataset, etc.).
 
     Args:
-        url: The URL or input to classify
+        input_data (str): The URL or input to classify
 
     Returns:
-        The field type: "Paper", "Code", "Space", "Model", "Dataset", "Project", or "Unknown"
-
-    Examples:
-        >>> classify_research_url("https://arxiv.org/abs/2010.11929")
-        "Paper"
-
-        >>> classify_research_url("https://github.com/google-research/vision_transformer")
-        "Code"
-
-        >>> classify_research_url("https://huggingface.co/google/vit-base-patch16-224")
-        "Model"
+        str: The field type: "Paper", "Code", "Space", "Model", "Dataset", "Project", or "Unknown"
     """
-    if not url or not url.strip():
+    if not input_data or not input_data.strip():
         return "Unknown"
 
     try:
-        # Call the backend
-        result = make_backend_request("infer-field", {"value": url})
-
-        # Extract field from response
+        result = make_backend_request("infer-field", {"value": input_data})
         field = result.get("field", "Unknown")
         return field if field else "Unknown"
 
@@ -335,11 +241,15 @@ def infer_organizations(input_data: str) -> List[str]:
     """
     Infer affiliated organizations from research paper or project information.
 
+    This function attempts to extract organization names from research metadata,
+    author affiliations, and repository information using NLP analysis to identify
+    institutional affiliations from paper authors and project contributors.
+
     Args:
-        input_data: A URL, paper title, or other research-related input
+        input_data (str): A URL, paper title, or other research-related input
 
     Returns:
-        A list of organization names, or empty list if no organizations found
+        List[str]: A list of organization names, or empty list if no organizations found
     """
     if not input_data or not input_data.strip():
         return []
@@ -365,11 +275,15 @@ def infer_publication_date(input_data: str) -> str:
     """
     Infer publication date from research paper or project information.
 
+    This function attempts to extract publication dates from paper metadata,
+    repository creation dates, or release information. Returns dates in
+    standardized format (YYYY-MM-DD) when possible.
+
     Args:
-        input_data: A URL, paper title, or other research-related input
+        input_data (str): A URL, paper title, or other research-related input
 
     Returns:
-        Publication date as string (YYYY-MM-DD format), or empty string if not found
+        str: Publication date as string (YYYY-MM-DD format), or empty string if not found
     """
     if not input_data or not input_data.strip():
         return ""
@@ -388,11 +302,15 @@ def infer_model(input_data: str) -> str:
     """
     Infer associated HuggingFace model from research paper or project information.
 
+    This function attempts to find HuggingFace models associated with research papers,
+    GitHub repositories, or project pages. It searches for model references in papers,
+    README files, and related documentation.
+
     Args:
-        input_data: A URL, paper title, or other research-related input
+        input_data (str): A URL, paper title, or other research-related input
 
     Returns:
-        HuggingFace model URL, or empty string if no model found
+        str: HuggingFace model URL, or empty string if no model found
     """
     if not input_data or not input_data.strip():
         return ""
@@ -411,11 +329,15 @@ def infer_dataset(input_data: str) -> str:
     """
     Infer associated HuggingFace dataset from research paper or project information.
 
+    This function attempts to find HuggingFace datasets used or created by research papers,
+    GitHub repositories, or projects. It analyzes paper content, repository documentation,
+    and project descriptions.
+
     Args:
-        input_data: A URL, paper title, or other research-related input
+        input_data (str): A URL, paper title, or other research-related input
 
     Returns:
-        HuggingFace dataset URL, or empty string if no dataset found
+        str: HuggingFace dataset URL, or empty string if no dataset found
     """
    if not input_data or not input_data.strip():
         return ""
@@ -434,11 +356,15 @@ def infer_space(input_data: str) -> str:
     """
     Infer associated HuggingFace space from research paper or project information.
 
+    This function attempts to find HuggingFace spaces (demos/applications) associated
+    with research papers, models, or GitHub repositories. It looks for interactive
+    demos and applications built around research.
+
     Args:
-        input_data: A URL, paper title, or other research-related input
+        input_data (str): A URL, paper title, or other research-related input
 
     Returns:
-        HuggingFace space URL, or empty string if no space found
+        str: HuggingFace space URL, or empty string if no space found
     """
     if not input_data or not input_data.strip():
         return ""
@@ -457,11 +383,15 @@ def infer_license(input_data: str) -> str:
     """
     Infer license information from research repository or project.
 
+    This function attempts to extract license information from GitHub repositories,
+    project documentation, or associated code. It checks license files, repository
+    metadata, and project descriptions.
+
     Args:
-        input_data: A URL, repository link, or other research-related input
+        input_data (str): A URL, repository link, or other research-related input
 
     Returns:
-        License name/type, or empty string if no license found
+        str: License name/type, or empty string if no license found
     """
     if not input_data or not input_data.strip():
         return ""
@@ -476,111 +406,6 @@ def infer_license(input_data: str) -> str:
     return ""
 
 
-def batch_infer_research(input_list: List[str], inference_type: str = "authors") -> List[Dict[str, Any]]:
-    """
-    Perform batch inference on multiple research items for scale analysis.
-
-    This function processes multiple research URLs or titles simultaneously,
-    applying the specified inference type to each item. Useful for analyzing
-    large research datasets, comparing multiple papers, or building research
-    knowledge graphs.
-
-    Args:
-        input_list: List of URLs, paper titles, or research-related inputs to process
-        inference_type: Type of inference to perform on each item.
-            Options: "authors", "paper", "code", "name", "organizations",
-            "date", "model", "dataset", "space", "license", "classify"
-
-    Returns:
-        List of dictionaries, each containing:
-        - "input": The original input string
-        - "result": The inference result (format depends on inference_type)
-        - "success": Boolean indicating if inference succeeded
-        - "error": Error message if inference failed
-
-    Examples:
-        >>> papers = [
-        ...     "https://arxiv.org/abs/2010.11929",
-        ...     "https://arxiv.org/abs/1706.03762",
-        ...     "https://github.com/openai/gpt-2"
-        ... ]
-        >>> results = batch_infer_research(papers, "authors")
-        >>> for result in results:
-        ...     print(f"{result['input']}: {len(result['result'])} authors")
-
-        >>> urls = ["https://huggingface.co/bert-base-uncased", "https://github.com/pytorch/pytorch"]
-        >>> classifications = batch_infer_research(urls, "classify")
-
-    Notes:
-        - Processing is done sequentially to avoid overwhelming the backend
-        - Failed inferences return empty results rather than raising exceptions
-        - Large batches may take significant time - consider chunking for very large datasets
-    """
-    if not input_list:
-        return []
-
-    # Map inference types to their corresponding functions
-    inference_functions = {
-        "authors": infer_authors,
-        "paper": infer_paper_url,
-        "code": infer_code_repository,
-        "name": infer_research_name,
-        "organizations": infer_organizations,
-        "date": infer_publication_date,
-        "model": infer_model,
-        "dataset": infer_dataset,
-        "space": infer_space,
-        "license": infer_license,
-        "classify": classify_research_url,
-    }
-
-    if inference_type not in inference_functions:
-        logger.error(f"Invalid inference type: {inference_type}")
-        return []
-
-    inference_func = inference_functions[inference_type]
-    results = []
-
-    logger.info(f"Starting batch inference of type '{inference_type}' on {len(input_list)} items")
-
-    for i, input_item in enumerate(input_list):
-        try:
-            if not input_item or not isinstance(input_item, str):
-                results.append({
-                    "input": str(input_item),
-                    "result": None,
-                    "success": False,
-                    "error": "Invalid input: must be non-empty string"
-                })
-                continue
-
-            # Perform inference
-            result = inference_func(input_item)
-
-            results.append({
-                "input": input_item,
-                "result": result,
-                "success": True,
-                "error": None
-            })
-
-            logger.debug(f"Batch item {i+1}/{len(input_list)} completed successfully")
-
-        except Exception as e:
-            logger.error(f"Batch inference failed for item {i+1}: {e}")
-            results.append({
-                "input": input_item,
-                "result": None,
-                "success": False,
-                "error": str(e)
-            })
-
-    successful_count = sum(1 for r in results if r["success"])
-    logger.info(f"Batch inference completed: {successful_count}/{len(input_list)} successful")
-
-    return results
-
-
 def find_research_relationships(input_data: str) -> Dict[str, Any]:
     """
     Find ALL related research resources across platforms for comprehensive analysis.
@@ -591,10 +416,10 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
     and understanding the complete ecosystem around a research topic.
 
     Args:
-        input_data: A URL, paper title, or other research-related input
+        input_data (str): A URL, paper title, or other research-related input
 
     Returns:
-        Dictionary containing all discovered related resources:
+        Dict[str, Any]: Dictionary containing all discovered related resources:
         {
            "paper": str | None,       # Associated research paper
            "code": str | None,        # Code repository URL
@@ -610,23 +435,12 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
           "success_count": int,       # Number of successful inferences
           "total_inferences": int     # Total inferences attempted
        }
-
-    Examples:
-        >>> relationships = find_research_relationships("https://arxiv.org/abs/2010.11929")
-        >>> print(f"Found {relationships['success_count']} related resources")
-        >>> print(f"Authors: {relationships['authors']}")
-        >>> print(f"Code: {relationships['code']}")
-        >>> print(f"Model: {relationships['model']}")
-
-        >>> ecosystem = find_research_relationships("Vision Transformer")
-        >>> if ecosystem['paper']:
-        ...     print(f"Paper: {ecosystem['paper']}")
-        >>> if ecosystem['code']:
-        ...     print(f"Implementation: {ecosystem['code']}")
     """
+    if not input_data or not input_data.strip():
+        return {"error": "Input data cannot be empty", "success_count": 0, "total_inferences": 0}
+
     try:
-        # Validate input
-        cleaned_input = validate_input(input_data, "input_data")
+        cleaned_input = input_data.strip()
 
         # Initialize result structure
         relationships = {
@@ -683,306 +497,54 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
         logger.info(f"Research relationship analysis completed: {relationships['success_count']}/{relationships['total_inferences']} successful")
         return relationships
 
-    except ValueError as e:
-        logger.error(f"Input validation error: {e}")
-        return {"error": str(e), "success_count": 0, "total_inferences": 0}
     except Exception as e:
         logger.error(f"Error finding research relationships: {e}")
         return {"error": str(e), "success_count": 0, "total_inferences": 0}
 
 
-def validate_research_urls(urls: List[str]) -> List[Dict[str, Any]]:
-    """
-    Validate accessibility and format of research URLs at scale.
-
-    This function checks multiple research URLs for accessibility, format
-    validity, and basic content analysis. Useful for data cleaning,
-    link validation, and quality assurance of research datasets.
-
-    Args:
-        urls: List of URLs to validate
-
-    Returns:
-        List of validation results, each containing:
-        - "url": The original URL
-        - "accessible": Boolean indicating if URL is reachable
-        - "status_code": HTTP status code (if applicable)
-        - "format_valid": Boolean indicating if URL format is valid
-        - "platform": Detected platform (arxiv, github, huggingface, etc.)
-        - "error": Error message if validation failed
-
-    Examples:
-        >>> urls = [
-        ...     "https://arxiv.org/abs/2010.11929",
-        ...     "https://github.com/google-research/vision_transformer",
-        ...     "https://invalid-url-example"
-        ... ]
-        >>> validation_results = validate_research_urls(urls)
-        >>> accessible_urls = [r for r in validation_results if r["accessible"]]
-        >>> print(f"{len(accessible_urls)}/{len(urls)} URLs are accessible")
-    """
-    if not urls:
-        return []
-
-    results = []
-    logger.info(f"Validating {len(urls)} research URLs")
-
-    for url in urls:
-        result = {
-            "url": url,
-            "accessible": False,
-            "status_code": None,
-            "format_valid": False,
-            "platform": "unknown",
-            "error": None
-        }
-
-        try:
-            # Basic format validation
-            if not isinstance(url, str) or not url.strip():
-                result["error"] = "Invalid URL format: empty or non-string"
-                results.append(result)
-                continue
-
-            cleaned_url = url.strip()
-
-            # URL format validation
-            if not cleaned_url.startswith(("http://", "https://")):
-                result["error"] = "Invalid URL format: must start with http:// or https://"
-                results.append(result)
-                continue
-
-            result["format_valid"] = True
-
-            # Platform detection
-            if "arxiv.org" in cleaned_url:
-                result["platform"] = "arxiv"
-            elif "github.com" in cleaned_url:
-                result["platform"] = "github"
-            elif "huggingface.co" in cleaned_url:
-                result["platform"] = "huggingface"
-            elif "github.io" in cleaned_url:
-                result["platform"] = "github_pages"
-
-            # Accessibility check
-            try:
-                response = requests.head(cleaned_url, timeout=10, allow_redirects=True)
-                result["status_code"] = response.status_code
-                result["accessible"] = 200 <= response.status_code < 400
-
-            except requests.exceptions.Timeout:
-                result["error"] = "Timeout: URL not accessible within 10 seconds"
-            except requests.exceptions.ConnectionError:
-                result["error"] = "Connection error: Unable to reach URL"
-            except requests.exceptions.RequestException as e:
-                result["error"] = f"Request failed: {str(e)}"
-
-        except Exception as e:
-            result["error"] = f"Validation error: {str(e)}"
-
-        results.append(result)
-
-    accessible_count = sum(1 for r in results if r["accessible"])
-    logger.info(f"URL validation completed: {accessible_count}/{len(urls)} accessible")
-
-    return results
-
-
-# Create Gradio interface
-def create_demo():
-    """Create the Gradio demo interface for testing."""
-
-    with gr.Blocks(title="Research Tracker MCP Server") as demo:
-        gr.Markdown("# Research Tracker MCP Server")
-        gr.Markdown("Test the comprehensive research inference utilities available through MCP. This server provides cross-platform research analysis, batch processing, and relationship discovery.")
-
-        # Core inference functions
-        with gr.TabItem("Core Inference"):
-            with gr.Tab("Authors"):
-                with gr.Row():
-                    author_input = gr.Textbox(
-                        label="Input (URL, paper title, etc.)",
-                        placeholder="https://arxiv.org/abs/2010.11929",
-                        lines=1
-                    )
-                    author_output = gr.JSON(label="Authors")
-                author_btn = gr.Button("Infer Authors")
-                author_btn.click(infer_authors, inputs=author_input, outputs=author_output)
-
-            with gr.Tab("Paper"):
-                with gr.Row():
-                    paper_input = gr.Textbox(
-                        label="Input (GitHub repo, project name, etc.)",
-                        placeholder="https://github.com/google-research/vision_transformer",
-                        lines=1
-                    )
-                    paper_output = gr.Textbox(label="Paper URL")
-                paper_btn = gr.Button("Infer Paper")
-                paper_btn.click(infer_paper_url, inputs=paper_input, outputs=paper_output)
-
-            with gr.Tab("Code"):
-                with gr.Row():
-                    code_input = gr.Textbox(
-                        label="Input (paper URL, project name, etc.)",
-                        placeholder="https://arxiv.org/abs/2010.11929",
-                        lines=1
-                    )
-                    code_output = gr.Textbox(label="Code Repository URL")
-                code_btn = gr.Button("Infer Code")
-                code_btn.click(infer_code_repository, inputs=code_input, outputs=code_output)
-
-            with gr.Tab("Name"):
-                with gr.Row():
-                    name_input = gr.Textbox(
-                        label="Input (URL, repo, etc.)",
-                        placeholder="https://github.com/google-research/vision_transformer",
-                        lines=1
-                    )
-                    name_output = gr.Textbox(label="Research Name/Title")
-                name_btn = gr.Button("Infer Name")
-                name_btn.click(infer_research_name, inputs=name_input, outputs=name_output)
-
-            with gr.Tab("Classify"):
-                with gr.Row():
-                    classify_input = gr.Textbox(
-                        label="URL to classify",
-                        placeholder="https://huggingface.co/google/vit-base-patch16-224",
-                        lines=1
-                    )
-                    classify_output = gr.Textbox(label="URL Type")
-                classify_btn = gr.Button("Classify URL")
-                classify_btn.click(classify_research_url, inputs=classify_input, outputs=classify_output)
-
-        # Extended inference functions
-        with gr.TabItem("Extended Inference"):
-            with gr.Tab("Organizations"):
-                with gr.Row():
-                    orgs_input = gr.Textbox(
-                        label="Input (paper URL, repo, etc.)",
-                        placeholder="https://arxiv.org/abs/2010.11929",
-                        lines=1
-                    )
-                    orgs_output = gr.JSON(label="Organizations")
-                orgs_btn = gr.Button("Infer Organizations")
-                orgs_btn.click(infer_organizations, inputs=orgs_input, outputs=orgs_output)
-
-            with gr.Tab("Publication Date"):
-                with gr.Row():
-                    date_input = gr.Textbox(
-                        label="Input (paper URL, repo, etc.)",
-                        placeholder="https://arxiv.org/abs/2010.11929",
-                        lines=1
-                    )
-                    date_output = gr.Textbox(label="Publication Date")
-                date_btn = gr.Button("Infer Date")
-                date_btn.click(infer_publication_date, inputs=date_input, outputs=date_output)
-
-            with gr.Tab("Model"):
-                with gr.Row():
-                    model_input = gr.Textbox(
-                        label="Input (paper URL, project name, etc.)",
-                        placeholder="https://arxiv.org/abs/2010.11929",
-                        lines=1
-                    )
-                    model_output = gr.Textbox(label="HuggingFace Model URL")
-                model_btn = gr.Button("Infer Model")
-                model_btn.click(infer_model, inputs=model_input, outputs=model_output)
-
-            with gr.Tab("Dataset"):
-                with gr.Row():
-                    dataset_input = gr.Textbox(
-                        label="Input (paper URL, project name, etc.)",
-                        placeholder="https://arxiv.org/abs/1706.03762",
-                        lines=1
-                    )
-                    dataset_output = gr.Textbox(label="HuggingFace Dataset URL")
-                dataset_btn = gr.Button("Infer Dataset")
-                dataset_btn.click(infer_dataset, inputs=dataset_input, outputs=dataset_output)
-
-            with gr.Tab("Space"):
-                with gr.Row():
-                    space_input = gr.Textbox(
-                        label="Input (model URL, paper, etc.)",
-                        placeholder="https://huggingface.co/google/vit-base-patch16-224",
-                        lines=1
-                    )
-                    space_output = gr.Textbox(label="HuggingFace Space URL")
-                space_btn = gr.Button("Infer Space")
-                space_btn.click(infer_space, inputs=space_input, outputs=space_output)
-
-            with gr.Tab("License"):
-                with gr.Row():
-                    license_input = gr.Textbox(
-                        label="Input (repository URL, project, etc.)",
-                        placeholder="https://github.com/google-research/vision_transformer",
-                        lines=1
-                    )
-                    license_output = gr.Textbox(label="License Information")
-                license_btn = gr.Button("Infer License")
-                license_btn.click(infer_license, inputs=license_input, outputs=license_output)
-
-        # Research intelligence functions
-        with gr.TabItem("Research Intelligence"):
-            with gr.Tab("Research Relationships"):
-                gr.Markdown("Find ALL related resources for comprehensive research analysis")
-                with gr.Row():
-                    relationships_input = gr.Textbox(
-                        label="Input (URL, paper title, etc.)",
-                        placeholder="https://arxiv.org/abs/2010.11929",
-                        lines=1
-                    )
-                    relationships_output = gr.JSON(label="Related Resources")
-                relationships_btn = gr.Button("Find Research Relationships")
-                relationships_btn.click(find_research_relationships, inputs=relationships_input, outputs=relationships_output)
-
-            with gr.Tab("Batch Processing"):
-                gr.Markdown("Process multiple research items simultaneously")
-                with gr.Row():
-                    with gr.Column():
-                        batch_input = gr.Textbox(
-                            label="Input URLs/Titles (one per line)",
-                            placeholder="https://arxiv.org/abs/2010.11929\nhttps://github.com/openai/gpt-2\nVision Transformer",
-                            lines=5
-                        )
-                        batch_type = gr.Dropdown(
-                            choices=["authors", "paper", "code", "name", "organizations", "date", "model", "dataset", "space", "license", "classify"],
-                            value="authors",
-                            label="Inference Type"
-                        )
-                    batch_output = gr.JSON(label="Batch Results")
-
-                def process_batch(input_text, inference_type):
-                    if not input_text.strip():
-                        return []
-                    input_list = [line.strip() for line in input_text.strip().split('\n') if line.strip()]
-                    return batch_infer_research(input_list, inference_type)
-
-                batch_btn = gr.Button("Process Batch")
-                batch_btn.click(process_batch, inputs=[batch_input, batch_type], outputs=batch_output)
-
-            with gr.Tab("URL Validation"):
-                gr.Markdown("Validate accessibility and format of research URLs")
-                with gr.Row():
-                    with gr.Column():
-                        url_input = gr.Textbox(
-                            label="URLs to validate (one per line)",
-                            placeholder="https://arxiv.org/abs/2010.11929\nhttps://github.com/google-research/vision_transformer\nhttps://huggingface.co/google/vit-base-patch16-224",
-                            lines=5
-                        )
-                    url_output = gr.JSON(label="Validation Results")
-
-                def validate_urls(input_text):
-                    if not input_text.strip():
-                        return []
-                    url_list = [line.strip() for line in input_text.strip().split('\n') if line.strip()]
-                    return validate_research_urls(url_list)
-
-                url_btn = gr.Button("Validate URLs")
-                url_btn.click(validate_urls, inputs=url_input, outputs=url_output)
-
-    return demo
+# Create minimal Gradio interface focused on MCP tool exposure
+with gr.Blocks(title="Research Tracker MCP Server") as demo:
+    gr.Markdown("# Research Tracker MCP Server")
+    gr.Markdown("""
+    This server provides MCP tools for research inference and metadata extraction.
+
+    **Available MCP Tools:**
+    - `infer_authors` - Extract author names from papers and repositories
+    - `infer_paper_url` - Find associated research paper URLs
+    - `infer_code_repository` - Discover code repository links
+    - `infer_research_name` - Extract research project names
+    - `classify_research_url` - Classify URL types (paper/code/model/etc.)
+    - `infer_organizations` - Identify affiliated organizations
+    - `infer_publication_date` - Extract publication dates
+    - `infer_model` - Find associated HuggingFace models
+    - `infer_dataset` - Find associated HuggingFace datasets
+    - `infer_space` - Find associated HuggingFace spaces
+    - `infer_license` - Extract license information
+    - `find_research_relationships` - Comprehensive research ecosystem analysis
+
+    **Input Support:**
+    - arXiv paper URLs (https://arxiv.org/abs/...)
+    - GitHub repository URLs (https://github.com/...)
+    - HuggingFace model/dataset/space URLs
+    - Research paper titles and project names
+    - Project page URLs
+    """)
+
+    # Expose all core functions as MCP tools
+    gr.api(infer_authors)
+    gr.api(infer_paper_url)
+    gr.api(infer_code_repository)
+    gr.api(infer_research_name)
+    gr.api(classify_research_url)
+    gr.api(infer_organizations)
+    gr.api(infer_publication_date)
+    gr.api(infer_model)
+    gr.api(infer_dataset)
+    gr.api(infer_space)
+    gr.api(infer_license)
+    gr.api(find_research_relationships)
 
 
 if __name__ == "__main__":
-    demo = create_demo()
+    logger.info("Starting Research Tracker MCP Server")
     demo.launch(mcp_server=True, share=False)
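
With the tabbed testing UI removed, the `gr.api(...)` calls above are what keep each function callable programmatically. A minimal sketch of exercising one endpoint with `gradio_client` follows; the Space id is a placeholder, and it assumes each function registered via `gr.api()` is exposed under its own name:

# Sketch only: the Space id below is hypothetical, and this assumes
# gr.api() exposes each function under "/<function_name>".
from gradio_client import Client

client = Client("user/research-tracker-mcp")  # hypothetical Space id

authors = client.predict(
    "https://arxiv.org/abs/2010.11929",
    api_name="/infer_authors",
)
print(authors)  # expected: a list of author name strings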
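
The removed `batch_infer_research` helper can be approximated on the caller's side if batch processing is still needed. A hedged sketch, assuming this module is importable as `app` and mirroring the old per-item result shape:

# Sketch only: client-side stand-in for the removed batch_infer_research().
# Assumes app.py is importable as `app`; mirrors the old result shape
# ("input" / "result" / "success" / "error") and processes sequentially,
# as the removed helper did, to avoid overwhelming the backend.
from typing import Any, Dict, List

from app import infer_authors  # hypothetical import path

def batch_infer_authors(items: List[str]) -> List[Dict[str, Any]]:
    results = []
    for item in items:
        try:
            result = infer_authors(item)
            results.append({"input": item, "result": result, "success": True, "error": None})
        except Exception as e:  # infer_authors already returns [] on error, but be safe
            results.append({"input": item, "result": None, "success": False, "error": str(e)})
    return results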