dylanebert committed · Commit b935477 · Parent(s): efc2e3f

app.py CHANGED
@@ -55,7 +55,8 @@ ALLOWED_DOMAINS = {
     "arxiv.org",
     "huggingface.co",
     "github.com",
-    "github.io"
+    "github.io",
+    "raw.githubusercontent.com"
 }
 
 if not HF_TOKEN:
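Presumably the new raw.githubusercontent.com entry is what lets the README download_url fetched later in this commit pass the domain allow-list. The enforcement code is not part of this diff; the following is only a hypothetical sketch of how such a check is commonly written (is_allowed_url is an invented name, not a function from app.py).

# Hypothetical sketch (not from app.py): one common way a domain allow-list
# like ALLOWED_DOMAINS is enforced before fetching a URL.
from urllib.parse import urlparse

ALLOWED_DOMAINS = {
    "arxiv.org",
    "huggingface.co",
    "github.com",
    "github.io",
    "raw.githubusercontent.com",
}

def is_allowed_url(url: str) -> bool:
    """Return True if the URL's host is an allowed domain or a subdomain of one."""
    host = (urlparse(url).hostname or "").lower()
    return any(host == d or host.endswith("." + d) for d in ALLOWED_DOMAINS)

# Example: the raw README URL returned by the GitHub API's download_url field
print(is_allowed_url("https://raw.githubusercontent.com/openai/CLIP/main/README.md"))  # True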
@@ -211,7 +212,14 @@ def extract_links_from_soup(soup, text):
     html_links = [link.get("href") for link in soup.find_all("a") if link.get("href")]
     link_pattern = re.compile(r"\[.*?\]\((.*?)\)")
     markdown_links = link_pattern.findall(text)
-
+
+    # Also extract direct URLs that aren't in markdown format
+    url_pattern = re.compile(r'https?://[^\s\)]+')
+    direct_urls = url_pattern.findall(text)
+
+    # Combine all links and remove duplicates
+    all_links = html_links + markdown_links + direct_urls
+    return list(set(all_links))
 
 
 def scrape_huggingface_paper_page(paper_url: str) -> Dict[str, Any]:
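A standalone sketch of the updated extraction logic (mirroring the hunk above rather than importing app.py), showing how anchor hrefs, markdown link targets, and bare URLs are pooled and de-duplicated. The extract_links wrapper name and the sample README text are illustrative only.

# Sketch mirroring the updated extract_links_from_soup behavior.
import re
from bs4 import BeautifulSoup

def extract_links(text: str) -> list:
    soup = BeautifulSoup(text, "html.parser")
    # <a href=...> targets, when the input is HTML
    html_links = [a.get("href") for a in soup.find_all("a") if a.get("href")]
    # [label](url) targets, when the input is markdown
    markdown_links = re.findall(r"\[.*?\]\((.*?)\)", text)
    # bare URLs anywhere in the text (new in this commit)
    direct_urls = re.findall(r"https?://[^\s\)]+", text)
    # duplicates collapse via the set
    return list(set(html_links + markdown_links + direct_urls))

readme = (
    "[paper](https://arxiv.org/abs/2103.00020)\n"
    "Weights: https://huggingface.co/openai/clip-vit-base-patch32"
)
print(sorted(extract_links(readme)))
# ['https://arxiv.org/abs/2103.00020', 'https://huggingface.co/openai/clip-vit-base-patch32']

Note that the arxiv URL is picked up by both the markdown pattern and the new bare-URL pattern, and the set() keeps only one copy.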
@@ -369,22 +377,39 @@ def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
             logger.debug(f"Failed to scrape project page: {e}")
 
     # Try GitHub README parsing
-    if row_data.get("Code") is not None and
+    if row_data.get("Code") is not None and "github.com" in row_data["Code"]:
         try:
             repo = row_data["Code"].split("github.com/")[1]
-
-
-            if
-
-            if
-
-            if
-
-
-
-
-
-
+
+            # First try with GitHub API if available
+            if GITHUB_AUTH:
+                readme_response = make_github_request(f"/repos/{repo}/readme")
+                if readme_response:
+                    readme = readme_response.json()
+                    if readme.get("type") == "file" and readme.get("download_url"):
+                        response = cached_request(readme["download_url"])
+                        if response:
+                            soup = BeautifulSoup(response.text, "html.parser")
+                            links = extract_links_from_soup(soup, response.text)
+                            for link in links:
+                                if link and ("arxiv" in link or "huggingface.co/papers" in link):
+                                    logger.info(f"Paper {link} inferred from Code (via GitHub API)")
+                                    return link
+
+            # Fallback: try scraping the GitHub page directly
+            try:
+                github_url = row_data["Code"]
+                response = cached_request(github_url)
+                if response:
+                    soup = BeautifulSoup(response.text, "html.parser")
+                    links = extract_links_from_soup(soup, response.text)
+                    for link in links:
+                        if link and ("arxiv" in link or "huggingface.co/papers" in link):
+                            logger.info(f"Paper {link} inferred from Code (via GitHub scraping)")
+                            return link
+            except (ValidationError, ExternalAPIError):
+                pass
+
         except Exception:
             pass
 
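For orientation, a minimal sketch of the same two-step lookup using plain requests: it calls the public GitHub endpoint GET /repos/{owner}/{repo}/readme, follows the returned download_url, and scans the raw README for arxiv or huggingface.co/papers links. app.py's make_github_request and cached_request helpers (which add auth, caching, and domain validation) are deliberately left out, and find_paper_link is an invented name, not part of the codebase.

# Sketch of the README-based paper lookup, without app.py's helpers.
import re
from typing import Optional

import requests

def find_paper_link(repo: str) -> Optional[str]:
    """repo is 'owner/name', e.g. 'openai/CLIP'."""
    meta = requests.get(f"https://api.github.com/repos/{repo}/readme", timeout=10)
    if meta.status_code != 200:
        return None
    info = meta.json()
    if info.get("type") != "file" or not info.get("download_url"):
        return None
    # download_url points at raw.githubusercontent.com, hence the allow-list change above
    readme = requests.get(info["download_url"], timeout=10).text
    for link in re.findall(r"https?://[^\s\)]+", readme):
        if "arxiv" in link or "huggingface.co/papers" in link:
            return link
    return None

# Example: print(find_paper_link("openai/CLIP"))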
@@ -717,7 +742,7 @@ def infer_authors(input_data: str) -> List[str]:
     Examples:
         - "https://arxiv.org/abs/2103.00020"
         - "https://huggingface.co/papers/2103.00020"
-        - "CLIP
+        - "https://github.com/openai/CLIP"
 
     Returns:
         List[str]: A list of author names as strings, or empty list if no authors found.
@@ -1192,10 +1217,10 @@ with gr.Blocks(title="Research Tracker MCP Server") as demo:
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(
-                label="Demo Input
+                label="Demo Input",
                 placeholder="https://arxiv.org/abs/2506.18787",
                 lines=2,
-                info="
+                info="Paper URL, repository URL, or project page"
             )
             submit_btn = gr.Button("🔍 Demonstrate find_research_relationships", variant="primary")
 