dylanebert committed
Commit 00ab4f8 · 1 Parent(s): b18cfa8

multi-round discovery

Files changed (1)
  1. app.py +261 -64
app.py CHANGED
@@ -201,12 +201,112 @@ def cached_request(url: str, timeout: int = REQUEST_TIMEOUT) -> Optional[request
 def get_arxiv_id(paper_url: str) -> Optional[str]:
     """Extract arXiv ID from paper URL"""
     if "arxiv.org/abs/" in paper_url:
-        return paper_url.split("arxiv.org/abs/")[1]
+        return paper_url.split("arxiv.org/abs/")[1].split('.pdf')[0]
+    elif "arxiv.org/pdf/" in paper_url:
+        return paper_url.split("arxiv.org/pdf/")[1].split('.pdf')[0]
     elif "huggingface.co/papers" in paper_url:
         return paper_url.split("huggingface.co/papers/")[1]
     return None


+def clean_url(url):
+    """Clean malformed URLs by removing trailing HTML fragments and invalid characters"""
+    if not url:
+        return url
+
+    # Remove HTML closing tags and attributes that often get attached
+    import re
+
+    # Remove anything after quote marks followed by HTML-like content
+    url = re.sub(r'["\']\s*>.*$', '', url)
+
+    # Remove trailing HTML fragments
+    url = re.sub(r'["\']?\s*</.*$', '', url)
+
+    # Remove trailing punctuation and whitespace
+    url = url.rstrip('",;\'"()<>[] \t\n\r')
+
+    # Basic URL validation - should start with http/https and contain valid characters
+    if not re.match(r'^https?://[^\s<>"\'\[\]{}|\\^`]+$', url):
+        return None
+
+    return url
+
+
+def is_valid_paper_url(url):
+    """Check if a URL is a valid paper URL, excluding badges and non-paper content"""
+    if not url:
+        return False
+
+    url_lower = url.lower()
+
+    # Exclude badges, shields, and other non-paper URLs
+    if any(pattern in url_lower for pattern in [
+        'img.shields.io', 'badge', 'logo', 'icon', 'button',
+        'github.com/microsoft/trellis/issues', '/releases/', '/actions/',
+        '/wiki/', '/tree/', '/blob/', '.svg', '.png', '.jpg', '.gif'
+    ]):
+        return False
+
+    # Valid paper URL patterns
+    if any(pattern in url_lower for pattern in [
+        'arxiv.org/abs/', 'arxiv.org/pdf/', 'huggingface.co/papers/'
+    ]):
+        return True
+
+    return False
+
+
+def select_best_github_repo(github_links, context_keywords=None):
+    """Select the best GitHub repository from a list of GitHub URLs"""
+    if not github_links:
+        return None
+
+    if context_keywords is None:
+        context_keywords = []
+
+    # Score repositories based on various factors
+    scored_repos = []
+
+    for link in github_links:
+        if not link:
+            continue
+
+        score = 0
+        link_lower = link.lower()
+
+        # Skip user profiles (github.com/username without repo)
+        path_parts = link.split('github.com/')[-1].split('/')
+        if len(path_parts) < 2 or not path_parts[1]:
+            continue  # Skip user profiles
+
+        # Skip issue/PR/wiki pages - prefer main repo
+        if any(x in link_lower for x in ['/issues', '/pull', '/wiki', '/releases', '/actions']):
+            score -= 10
+
+        # Prefer repositories that match context keywords
+        for keyword in context_keywords:
+            if keyword.lower() in link_lower:
+                score += 20
+
+        # Prefer Microsoft/official org repos if in a Microsoft context
+        if 'microsoft' in link_lower and any(k.lower() in link_lower for k in context_keywords):
+            score += 15
+
+        # Prefer main branch/root repo URLs
+        if link_lower.endswith('.git') or '/tree/' not in link_lower:
+            score += 5
+
+        scored_repos.append((score, link))
+
+    if scored_repos:
+        # Return the highest scored repository
+        scored_repos.sort(key=lambda x: x[0], reverse=True)
+        return scored_repos[0][1]
+
+    return None
+
+
 def extract_links_from_soup(soup, text):
     """Extract both HTML and markdown links from soup and text"""
     html_links = [link.get("href") for link in soup.find_all("a") if link.get("href")]
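The three new helpers above are self-contained, so the intent of this hunk can be sanity-checked in isolation. A minimal sketch with made-up URLs, assuming the patched app.py can be imported (or the functions copied out) without side effects:

# Illustrative only - the URLs are hypothetical examples.
from app import get_arxiv_id, clean_url, is_valid_paper_url

# get_arxiv_id now also handles /pdf/ links and strips a trailing ".pdf".
assert get_arxiv_id("https://arxiv.org/pdf/2406.00001.pdf") == "2406.00001"

# clean_url drops HTML fragments that cling to scraped hrefs and rejects
# anything that no longer looks like a URL at all.
assert clean_url('https://arxiv.org/abs/2406.00001">') == "https://arxiv.org/abs/2406.00001"
assert clean_url("not a url") is None

# is_valid_paper_url keeps arXiv/HF paper links but filters badges and repo
# sub-pages, so README shields are no longer mistaken for papers.
assert is_valid_paper_url("https://img.shields.io/badge/arXiv-2406.00001-b31b1b.svg") is False
assert is_valid_paper_url("https://huggingface.co/papers/2406.00001") is True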
@@ -217,9 +317,10 @@ def extract_links_from_soup(soup, text):
     url_pattern = re.compile(r'https?://[^\s\)]+')
     direct_urls = url_pattern.findall(text)

-    # Combine all links and remove duplicates
+    # Combine all links, clean them, and remove duplicates
     all_links = html_links + markdown_links + direct_urls
-    return list(set(all_links))
+    cleaned_links = [clean_url(link) for link in all_links if link]
+    return list(set([link for link in cleaned_links if link]))


 def scrape_huggingface_paper_page(paper_url: str) -> Dict[str, Any]:
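With clean_url wired into extract_links_from_soup, scraped pages with malformed anchors now yield only validated, de-duplicated URLs. A rough illustration on a hypothetical HTML snippet (same import assumption as above):

from bs4 import BeautifulSoup
from app import extract_links_from_soup

html = '<a href="https://github.com/acme/demo">code</a> see https://arxiv.org/abs/2406.00001">'
soup = BeautifulSoup(html, "html.parser")
print(extract_links_from_soup(soup, html))
# The stray '">' is stripped by clean_url and set() collapses duplicates, so
# both URLs come back in canonical form (ordering is not guaranteed).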
@@ -370,7 +471,7 @@ def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
         soup = BeautifulSoup(response.text, "html.parser")
         for link in soup.find_all("a"):
             href = link.get("href")
-            if href and ("arxiv" in href or "huggingface.co/papers" in href):
+            if href and is_valid_paper_url(href):
                 logger.info(f"Paper {href} inferred from Project")
                 return href
     except (ValidationError, ExternalAPIError) as e:
@@ -392,7 +493,7 @@ def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
         soup = BeautifulSoup(response.text, "html.parser")
         links = extract_links_from_soup(soup, response.text)
         for link in links:
-            if link and ("arxiv" in link or "huggingface.co/papers" in link):
+            if link and is_valid_paper_url(link):
                 logger.info(f"Paper {link} inferred from Code (via GitHub API)")
                 return link

@@ -404,7 +505,7 @@ def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
         soup = BeautifulSoup(response.text, "html.parser")
         links = extract_links_from_soup(soup, response.text)
         for link in links:
-            if link and ("arxiv" in link or "huggingface.co/papers" in link):
+            if link and is_valid_paper_url(link):
                 logger.info(f"Paper {link} inferred from Code (via GitHub scraping)")
                 return link
     except (ValidationError, ExternalAPIError):
@@ -489,15 +590,32 @@ def infer_code_from_row(row_data: Dict[str, Any]) -> Optional[str]:
         r = requests.get(row_data["Project"], timeout=REQUEST_TIMEOUT)
         soup = BeautifulSoup(r.text, "html.parser")
         links = extract_links_from_soup(soup, r.text)
+
+        # Filter GitHub links
+        github_links = []
         for link in links:
             if link:
                 try:
                     url = urlparse(link)
                     if url.scheme in ["http", "https"] and "github.com" in url.netloc:
-                        logger.info(f"Code {link} inferred from Project")
-                        return link
+                        github_links.append(link)
                 except Exception:
                     pass
+
+        if github_links:
+            # Extract context keywords from the project page
+            context_keywords = []
+            if soup.title:
+                context_keywords.extend(soup.title.get_text().split())
+
+            # Use URL parts as context
+            project_url_parts = row_data["Project"].split('/')
+            context_keywords.extend([part for part in project_url_parts if part and len(part) > 2])
+
+            best_repo = select_best_github_repo(github_links, context_keywords)
+            if best_repo:
+                logger.info(f"Code {best_repo} inferred from Project")
+                return best_repo
     except Exception:
         pass

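select_best_github_repo is a plain additive score, so its effect on the links harvested from a project page is easy to preview. A hypothetical example (made-up repositories):

from app import select_best_github_repo

links = [
    "https://github.com/someuser",                     # user profile -> skipped
    "https://github.com/acme/demo-project/issues/12",  # issue page -> penalized
    "https://github.com/acme/demo-project",            # matches context -> wins
]
print(select_best_github_repo(links, context_keywords=["demo-project"]))
# -> https://github.com/acme/demo-project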
@@ -1020,11 +1138,105 @@ def infer_license(input_data: str) -> str:
     return ""


+def discover_all_urls(input_data: str) -> Dict[str, Any]:
+    """
+    Discover ALL related URLs from the input by building a complete resource graph.
+    This performs multiple rounds of discovery to find all interconnected resources.
+    """
+    discovered = {
+        "paper": None,
+        "code": None,
+        "project": None,
+        "model": None,
+        "dataset": None,
+        "space": None,
+        "hf_resources": None
+    }
+
+    # Initialize with input
+    row_data = create_row_data(input_data.strip())
+
+    # Round 1: Direct inferences from input
+    if row_data.get("Paper"):
+        discovered["paper"] = row_data["Paper"]
+    if row_data.get("Code"):
+        discovered["code"] = row_data["Code"]
+    if row_data.get("Project"):
+        discovered["project"] = row_data["Project"]
+    if row_data.get("Model"):
+        discovered["model"] = row_data["Model"]
+    if row_data.get("Dataset"):
+        discovered["dataset"] = row_data["Dataset"]
+    if row_data.get("Space"):
+        discovered["space"] = row_data["Space"]
+
+    # Round 2: Cross-inferences - keep discovering until no new URLs found
+    max_rounds = 3
+    for round_num in range(max_rounds):
+        found_new = False
+
+        # Try to find paper from code if we have code but no paper
+        if discovered["code"] and not discovered["paper"]:
+            temp_row = {"Code": discovered["code"], "Paper": None, "Project": discovered["project"]}
+            paper = infer_paper_from_row(temp_row)
+            if paper and paper != discovered["paper"]:
+                discovered["paper"] = paper
+                found_new = True
+
+        # Try to find code from paper if we have paper but no code
+        if discovered["paper"] and not discovered["code"]:
+            temp_row = {"Paper": discovered["paper"], "Code": None, "Project": discovered["project"]}
+            code = infer_code_from_row(temp_row)
+            if code and code != discovered["code"]:
+                discovered["code"] = code
+                found_new = True
+
+        # Try to find code from project if we have project but no code
+        if discovered["project"] and not discovered["code"]:
+            temp_row = {"Project": discovered["project"], "Code": None, "Paper": discovered["paper"]}
+            code = infer_code_from_row(temp_row)
+            if code and code != discovered["code"]:
+                discovered["code"] = code
+                found_new = True
+
+        # Scrape HuggingFace paper page for additional resources
+        if discovered["paper"] and not discovered["hf_resources"]:
+            arxiv_id = get_arxiv_id(discovered["paper"])
+            if "huggingface.co/papers" in discovered["paper"]:
+                discovered["hf_resources"] = scrape_huggingface_paper_page(discovered["paper"])
+                found_new = True
+            elif arxiv_id:
+                hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
+                discovered["hf_resources"] = scrape_huggingface_paper_page(hf_paper_url)
+                if discovered["hf_resources"] and any(discovered["hf_resources"].values()):
+                    found_new = True
+
+        # Extract additional resources from HF scraping
+        if discovered["hf_resources"]:
+            if not discovered["model"] and discovered["hf_resources"]["models"]:
+                discovered["model"] = discovered["hf_resources"]["models"][0]
+                found_new = True
+            if not discovered["dataset"] and discovered["hf_resources"]["datasets"]:
+                discovered["dataset"] = discovered["hf_resources"]["datasets"][0]
+                found_new = True
+            if not discovered["space"] and discovered["hf_resources"]["spaces"]:
+                discovered["space"] = discovered["hf_resources"]["spaces"][0]
+                found_new = True
+            if not discovered["code"] and discovered["hf_resources"]["code"]:
+                discovered["code"] = discovered["hf_resources"]["code"][0]
+                found_new = True
+
+        if not found_new:
+            break
+
+    return discovered
+
+
 @rate_limit("mcp_tools")
 def find_research_relationships(input_data: str) -> Dict[str, Any]:
     """
     Find ALL related research resources across platforms for comprehensive analysis.
-    Optimized version that scrapes once and uses cached results for all inferences.
+    Uses a multi-round discovery approach to build a complete resource graph.

     This is a comprehensive tool that combines all individual inference tools to provide
     a complete picture of a research project's ecosystem. It discovers:
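discover_all_urls is the core of the multi-round approach: round 1 seeds the graph from whatever the input already identifies, and each later round re-runs the cross-inferences (paper from code, code from paper or project, HF paper-page scraping) until a round adds nothing new or the three-round cap is hit. A hedged usage sketch, assuming create_row_data classifies a bare GitHub URL under Code:

from app import discover_all_urls

# Hypothetical input: a repository URL with no paper given up front.
discovered = discover_all_urls("https://github.com/acme/demo-project")
# Round 1 fills "code"; a later round may infer the paper from the repo,
# then scrape the matching huggingface.co/papers page for models, datasets
# and spaces before the loop reaches a fixed point.
print(discovered["paper"], discovered["model"], discovered["dataset"])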
@@ -1063,85 +1275,70 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
         "total_inferences": 10
     }

-    # Create row data and get paper URL first
-    row_data = create_row_data(cleaned_input)
-    paper_url = infer_paper_from_row(row_data)
-    if paper_url:
-        relationships["paper"] = paper_url
+    # Phase 1: Discover all URLs by building complete resource graph
+    discovered_urls = discover_all_urls(cleaned_input)
+
+    # Phase 2: Create comprehensive row data with all discovered URLs
+    complete_row_data = {
+        "Name": None,
+        "Authors": [],
+        "Paper": discovered_urls["paper"],
+        "Code": discovered_urls["code"],
+        "Project": discovered_urls["project"],
+        "Space": discovered_urls["space"],
+        "Model": discovered_urls["model"],
+        "Dataset": discovered_urls["dataset"],
+        "Orgs": [],
+        "License": None,
+        "Date": None,
+    }
+
+    # Phase 3: Perform all inferences using complete information
+    # Paper
+    if complete_row_data["Paper"]:
+        relationships["paper"] = complete_row_data["Paper"]
         relationships["success_count"] += 1
-        row_data["Paper"] = paper_url  # Update row data with found paper
-
-    # If we have a HuggingFace paper URL, scrape it once for all resources
-    hf_resources = None
-    if paper_url and "huggingface.co/papers" in paper_url:
-        hf_resources = scrape_huggingface_paper_page(paper_url)
-    elif paper_url and "arxiv.org/abs/" in paper_url:
-        # Try HuggingFace version
-        arxiv_id = get_arxiv_id(paper_url)
-        if arxiv_id:
-            hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
-            hf_resources = scrape_huggingface_paper_page(hf_paper_url)
-
-    # Now perform all other inferences efficiently
-    # Code inference
-    code_url = infer_code_from_row(row_data)
-    if not code_url and hf_resources and hf_resources["code"]:
-        code_url = hf_resources["code"][0]
-    if code_url:
-        relationships["code"] = code_url
+
+    # Code
+    if complete_row_data["Code"]:
+        relationships["code"] = complete_row_data["Code"]
         relationships["success_count"] += 1
-        row_data["Code"] = code_url

-    # Name inference
-    name = infer_name_from_row(row_data)
+    # Name inference (try all available sources)
+    name = infer_name_from_row(complete_row_data)
     if name:
         relationships["name"] = name
         relationships["success_count"] += 1

     # Authors inference
-    authors = infer_authors_from_row(row_data)
+    authors = infer_authors_from_row(complete_row_data)
     if authors:
         relationships["authors"] = authors
         relationships["success_count"] += 1

     # Date inference
-    date = infer_date_from_row(row_data)
+    date = infer_date_from_row(complete_row_data)
     if date:
         relationships["date"] = date
         relationships["success_count"] += 1

-    # Model inference (use cached HF resources first)
-    model_url = None
-    if hf_resources and hf_resources["models"]:
-        model_url = hf_resources["models"][0]
-    else:
-        model_url = infer_model_from_row(row_data)
-    if model_url:
-        relationships["model"] = model_url
+    # Model
+    if complete_row_data["Model"]:
+        relationships["model"] = complete_row_data["Model"]
         relationships["success_count"] += 1

-    # Dataset inference (use cached HF resources first)
-    dataset_url = None
-    if hf_resources and hf_resources["datasets"]:
-        dataset_url = hf_resources["datasets"][0]
-    else:
-        dataset_url = infer_dataset_from_row(row_data)
-    if dataset_url:
-        relationships["dataset"] = dataset_url
+    # Dataset
+    if complete_row_data["Dataset"]:
+        relationships["dataset"] = complete_row_data["Dataset"]
         relationships["success_count"] += 1

-    # Space inference (use cached HF resources first)
-    space_url = None
-    if hf_resources and hf_resources["spaces"]:
-        space_url = hf_resources["spaces"][0]
-    else:
-        space_url = infer_space_from_row(row_data)
-    if space_url:
-        relationships["space"] = space_url
+    # Space
+    if complete_row_data["Space"]:
+        relationships["space"] = complete_row_data["Space"]
         relationships["success_count"] += 1

     # License inference
-    license_info = infer_license_from_row(row_data)
+    license_info = infer_license_from_row(complete_row_data)
     if license_info:
         relationships["license"] = license_info
         relationships["success_count"] += 1