dylanebert committed
Commit b935477 · 1 Parent(s): efc2e3f
Files changed (1)
  1. app.py +44 -19
app.py CHANGED
@@ -55,7 +55,8 @@ ALLOWED_DOMAINS = {
     "arxiv.org",
     "huggingface.co",
     "github.com",
-    "github.io"
+    "github.io",
+    "raw.githubusercontent.com"
 }
 
 if not HF_TOKEN:
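As a quick illustration of what the expanded allowlist permits, here is a minimal, hypothetical check of a URL's host against these domains. The is_allowed_url helper and its subdomain rule are assumptions for the sketch, not code from app.py, and only the allowlist entries visible in this hunk are included.

# Hypothetical sketch, not part of this commit: validate a URL host against the allowlist.
from urllib.parse import urlparse

ALLOWED_DOMAINS = {"arxiv.org", "huggingface.co", "github.com", "github.io", "raw.githubusercontent.com"}

def is_allowed_url(url: str) -> bool:
    host = urlparse(url).netloc.lower()
    # Accept exact matches and subdomains of any allowlisted domain
    return any(host == domain or host.endswith("." + domain) for domain in ALLOWED_DOMAINS)

# With the new entry, raw README URLs now pass:
# is_allowed_url("https://raw.githubusercontent.com/openai/CLIP/main/README.md") -> True
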
@@ -211,7 +212,14 @@ def extract_links_from_soup(soup, text):
     html_links = [link.get("href") for link in soup.find_all("a") if link.get("href")]
     link_pattern = re.compile(r"\[.*?\]\((.*?)\)")
     markdown_links = link_pattern.findall(text)
-    return html_links + markdown_links
+
+    # Also extract direct URLs that aren't in markdown format
+    url_pattern = re.compile(r'https?://[^\s\)]+')
+    direct_urls = url_pattern.findall(text)
+
+    # Combine all links and remove duplicates
+    all_links = html_links + markdown_links + direct_urls
+    return list(set(all_links))
 
 
 def scrape_huggingface_paper_page(paper_url: str) -> Dict[str, Any]:
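A small standalone sketch of the updated extraction behavior, using only the patterns from the hunk above. Note that the direct-URL regex stops at whitespace or a closing parenthesis, and that list(set(...)) deduplicates but does not preserve order; the sample HTML and text here are illustrative.

# Standalone sketch mirroring the new extract_links_from_soup logic
import re
from bs4 import BeautifulSoup

text = "See [the paper](https://arxiv.org/abs/2103.00020) or https://huggingface.co/papers/2103.00020"
soup = BeautifulSoup('<a href="https://github.com/openai/CLIP">code</a>', "html.parser")

html_links = [a.get("href") for a in soup.find_all("a") if a.get("href")]
markdown_links = re.compile(r"\[.*?\]\((.*?)\)").findall(text)
direct_urls = re.compile(r'https?://[^\s\)]+').findall(text)   # plain URLs outside markdown syntax

# Deduplicated (unordered) result, as in the new return statement:
# the arXiv URL is matched by both patterns but appears only once.
links = list(set(html_links + markdown_links + direct_urls))
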
@@ -369,22 +377,39 @@ def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
         logger.debug(f"Failed to scrape project page: {e}")
 
     # Try GitHub README parsing
-    if row_data.get("Code") is not None and GITHUB_AUTH and "github.com" in row_data["Code"]:
+    if row_data.get("Code") is not None and "github.com" in row_data["Code"]:
         try:
             repo = row_data["Code"].split("github.com/")[1]
-            # GitHub API requires special handling
-            r = make_github_request(f"/repos/{repo}/readme")
-            if r:
-                readme = r.json()
-                if readme.get("type") == "file" and readme.get("download_url"):
-                    response = cached_request(readme["download_url"])
-                    if response:
-                        soup = BeautifulSoup(response.text, "html.parser")
-                        links = extract_links_from_soup(soup, r.text)
-                        for link in links:
-                            if link and ("arxiv" in link or "huggingface.co/papers" in link):
-                                logger.info(f"Paper {link} inferred from Code")
-                                return link
+
+            # First try with GitHub API if available
+            if GITHUB_AUTH:
+                readme_response = make_github_request(f"/repos/{repo}/readme")
+                if readme_response:
+                    readme = readme_response.json()
+                    if readme.get("type") == "file" and readme.get("download_url"):
+                        response = cached_request(readme["download_url"])
+                        if response:
+                            soup = BeautifulSoup(response.text, "html.parser")
+                            links = extract_links_from_soup(soup, response.text)
+                            for link in links:
+                                if link and ("arxiv" in link or "huggingface.co/papers" in link):
+                                    logger.info(f"Paper {link} inferred from Code (via GitHub API)")
+                                    return link
+
+            # Fallback: try scraping the GitHub page directly
+            try:
+                github_url = row_data["Code"]
+                response = cached_request(github_url)
+                if response:
+                    soup = BeautifulSoup(response.text, "html.parser")
+                    links = extract_links_from_soup(soup, response.text)
+                    for link in links:
+                        if link and ("arxiv" in link or "huggingface.co/papers" in link):
+                            logger.info(f"Paper {link} inferred from Code (via GitHub scraping)")
+                            return link
+            except (ValidationError, ExternalAPIError):
+                pass
+
         except Exception:
             pass
 
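The reworked block tries the authenticated GitHub API only when GITHUB_AUTH is set and otherwise, or on a miss, falls back to scraping the repository page itself. Below is a rough standalone sketch of that ordering using plain requests rather than app.py's make_github_request/cached_request helpers, so the headers, timeouts, and regex here are illustrative assumptions rather than the app's actual implementation.

# Illustrative sketch of the API-first, scrape-second lookup order
import re
import requests

URL_PATTERN = re.compile(r'https?://[^\s\)"]+')

def first_paper_link(text: str):
    # Return the first arXiv or HF papers URL found in the text, if any
    for url in URL_PATTERN.findall(text):
        if "arxiv" in url or "huggingface.co/papers" in url:
            return url
    return None

def paper_link_for_repo(repo: str, token: str = ""):
    # 1) Authenticated API path, skipped when no token is configured (the GITHUB_AUTH check)
    if token:
        r = requests.get(f"https://api.github.com/repos/{repo}/readme",
                         headers={"Authorization": f"Bearer {token}"}, timeout=10)
        if r.ok:
            readme = r.json()
            if readme.get("type") == "file" and readme.get("download_url"):
                link = first_paper_link(requests.get(readme["download_url"], timeout=10).text)
                if link:
                    return link

    # 2) Fallback: fetch and scan the repository's HTML page directly
    return first_paper_link(requests.get(f"https://github.com/{repo}", timeout=10).text)

# e.g. paper_link_for_repo("openai/CLIP") -> an arXiv link found in the README/repo page, or None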
 
@@ -717,7 +742,7 @@ def infer_authors(input_data: str) -> List[str]:
     Examples:
         - "https://arxiv.org/abs/2103.00020"
         - "https://huggingface.co/papers/2103.00020"
-        - "CLIP: Connecting Text and Images"
+        - "https://github.com/openai/CLIP"
 
     Returns:
         List[str]: A list of author names as strings, or empty list if no authors found.
@@ -1192,10 +1217,10 @@ with gr.Blocks(title="Research Tracker MCP Server") as demo:
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(
-                label="Demo Input (Paper URL, Repository, or Research Name)",
+                label="Demo Input",
                 placeholder="https://arxiv.org/abs/2506.18787",
                 lines=2,
-                info="Try: arXiv URLs, HuggingFace paper URLs, GitHub repos, or research titles"
+                info="Paper URL, repository URL, or project page"
             )
             submit_btn = gr.Button("🔍 Demonstrate find_research_relationships", variant="primary")
 
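For reference, a minimal Gradio sketch wiring the relabeled textbox to the demo button. The run_demo handler and the gr.JSON output are placeholders for illustration; the app's real find_research_relationships handler and output components are outside this diff.

# Hypothetical wiring sketch; only the Textbox/Button arguments come from the hunk above
import gradio as gr

def run_demo(query: str) -> dict:
    # Placeholder standing in for the app's find_research_relationships demo handler
    return {"input": query}

with gr.Blocks(title="Research Tracker MCP Server") as demo:
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Demo Input",
                placeholder="https://arxiv.org/abs/2506.18787",
                lines=2,
                info="Paper URL, repository URL, or project page",
            )
            submit_btn = gr.Button("🔍 Demonstrate find_research_relationships", variant="primary")
            output = gr.JSON(label="Result")
    submit_btn.click(run_demo, inputs=input_text, outputs=output)

demo.launch()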
 
 