dylanebert committed · Commit 00ab4f8 · Parent: b18cfa8
multi-round discovery
app.py
CHANGED
@@ -201,12 +201,112 @@ def cached_request(url: str, timeout: int = REQUEST_TIMEOUT) -> Optional[request
 def get_arxiv_id(paper_url: str) -> Optional[str]:
     """Extract arXiv ID from paper URL"""
     if "arxiv.org/abs/" in paper_url:
-        return paper_url.split("arxiv.org/abs/")[1]
+        return paper_url.split("arxiv.org/abs/")[1].split('.pdf')[0]
+    elif "arxiv.org/pdf/" in paper_url:
+        return paper_url.split("arxiv.org/pdf/")[1].split('.pdf')[0]
     elif "huggingface.co/papers" in paper_url:
         return paper_url.split("huggingface.co/papers/")[1]
     return None


+def clean_url(url):
+    """Clean malformed URLs by removing trailing HTML fragments and invalid characters"""
+    if not url:
+        return url
+
+    # Remove HTML closing tags and attributes that often get attached
+    import re
+
+    # Remove anything after quote marks followed by HTML-like content
+    url = re.sub(r'["\']\s*>.*$', '', url)
+
+    # Remove trailing HTML fragments
+    url = re.sub(r'["\']?\s*</.*$', '', url)
+
+    # Remove trailing punctuation and whitespace
+    url = url.rstrip('",;\'"()<>[] \t\n\r')
+
+    # Basic URL validation - should start with http/https and contain valid characters
+    if not re.match(r'^https?://[^\s<>"\'\[\]{}|\\^`]+$', url):
+        return None
+
+    return url
+
+
+def is_valid_paper_url(url):
+    """Check if a URL is a valid paper URL, excluding badges and non-paper content"""
+    if not url:
+        return False
+
+    url_lower = url.lower()
+
+    # Exclude badges, shields, and other non-paper URLs
+    if any(pattern in url_lower for pattern in [
+        'img.shields.io', 'badge', 'logo', 'icon', 'button',
+        'github.com/microsoft/trellis/issues', '/releases/', '/actions/',
+        '/wiki/', '/tree/', '/blob/', '.svg', '.png', '.jpg', '.gif'
+    ]):
+        return False
+
+    # Valid paper URL patterns
+    if any(pattern in url_lower for pattern in [
+        'arxiv.org/abs/', 'arxiv.org/pdf/', 'huggingface.co/papers/'
+    ]):
+        return True
+
+    return False
+
+
+def select_best_github_repo(github_links, context_keywords=None):
+    """Select the best GitHub repository from a list of GitHub URLs"""
+    if not github_links:
+        return None
+
+    if context_keywords is None:
+        context_keywords = []
+
+    # Score repositories based on various factors
+    scored_repos = []
+
+    for link in github_links:
+        if not link:
+            continue
+
+        score = 0
+        link_lower = link.lower()
+
+        # Skip user profiles (github.com/username without repo)
+        path_parts = link.split('github.com/')[-1].split('/')
+        if len(path_parts) < 2 or not path_parts[1]:
+            continue  # Skip user profiles
+
+        # Skip issue/PR/wiki pages - prefer main repo
+        if any(x in link_lower for x in ['/issues', '/pull', '/wiki', '/releases', '/actions']):
+            score -= 10
+
+        # Prefer repositories that match context keywords
+        for keyword in context_keywords:
+            if keyword.lower() in link_lower:
+                score += 20
+
+        # Prefer Microsoft/official org repos if in a Microsoft context
+        if 'microsoft' in link_lower and any(k.lower() in link_lower for k in context_keywords):
+            score += 15
+
+        # Prefer main branch/root repo URLs
+        if link_lower.endswith('.git') or '/tree/' not in link_lower:
+            score += 5
+
+        scored_repos.append((score, link))
+
+    if scored_repos:
+        # Return the highest scored repository
+        scored_repos.sort(key=lambda x: x[0], reverse=True)
+        return scored_repos[0][1]
+
+    return None
+
+
 def extract_links_from_soup(soup, text):
     """Extract both HTML and markdown links from soup and text"""
     html_links = [link.get("href") for link in soup.find_all("a") if link.get("href")]
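The new URL helpers are plain string utilities, so their intended behavior is easy to sanity-check in isolation. A minimal sketch (assuming the functions are importable from this module, e.g. from app import ...; the URLs are made-up placeholders and the expected values follow from the logic above rather than from a verified run):

    from app import get_arxiv_id, clean_url, is_valid_paper_url

    # get_arxiv_id now also handles /pdf/ links and strips a trailing ".pdf"
    print(get_arxiv_id("https://arxiv.org/pdf/0000.00000.pdf"))    # expected: "0000.00000"

    # clean_url trims HTML fragments that scraping often leaves attached
    print(clean_url('https://example.com/page">see paper</a>'))    # expected: "https://example.com/page"
    print(clean_url("not a url"))                                  # expected: None (fails the http(s) check)

    # is_valid_paper_url keeps arXiv / HF paper links and rejects badge images
    print(is_valid_paper_url("https://arxiv.org/abs/0000.00000"))                   # expected: True
    print(is_valid_paper_url("https://img.shields.io/badge/arXiv-0000.00000-red"))  # expected: False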
@@ -217,9 +317,10 @@ def extract_links_from_soup(soup, text)
     url_pattern = re.compile(r'https?://[^\s\)]+')
     direct_urls = url_pattern.findall(text)

-    # Combine all links and remove duplicates
+    # Combine all links, clean them, and remove duplicates
     all_links = html_links + markdown_links + direct_urls
-
+    cleaned_links = [clean_url(link) for link in all_links if link]
+    return list(set([link for link in cleaned_links if link]))


 def scrape_huggingface_paper_page(paper_url: str) -> Dict[str, Any]:
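With clean_url applied inside extract_links_from_soup, raw scraping artifacts should be normalized before deduplication. A rough sketch of the effect (hypothetical HTML and URLs; assumes BeautifulSoup is installed and the function is importable from this module):

    from bs4 import BeautifulSoup
    from app import extract_links_from_soup

    html = '<a href="https://github.com/org/repo">code</a> and https://example.com/page"> in text'
    links = extract_links_from_soup(BeautifulSoup(html, "html.parser"), html)
    # Expected to reduce to the two cleaned URLs, order not guaranteed because
    # duplicates are removed via a set:
    #   https://github.com/org/repo
    #   https://example.com/page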
@@ -370,7 +471,7 @@ def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
         soup = BeautifulSoup(response.text, "html.parser")
         for link in soup.find_all("a"):
             href = link.get("href")
-            if href and (
+            if href and is_valid_paper_url(href):
                 logger.info(f"Paper {href} inferred from Project")
                 return href
     except (ValidationError, ExternalAPIError) as e:
@@ -392,7 +493,7 @@ def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
         soup = BeautifulSoup(response.text, "html.parser")
         links = extract_links_from_soup(soup, response.text)
         for link in links:
-            if link and (
+            if link and is_valid_paper_url(link):
                 logger.info(f"Paper {link} inferred from Code (via GitHub API)")
                 return link

@@ -404,7 +505,7 @@ def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
         soup = BeautifulSoup(response.text, "html.parser")
         links = extract_links_from_soup(soup, response.text)
         for link in links:
-            if link and (
+            if link and is_valid_paper_url(link):
                 logger.info(f"Paper {link} inferred from Code (via GitHub scraping)")
                 return link
     except (ValidationError, ExternalAPIError):
@@ -489,15 +590,32 @@ def infer_code_from_row(row_data: Dict[str, Any]) -> Optional[str]:
         r = requests.get(row_data["Project"], timeout=REQUEST_TIMEOUT)
         soup = BeautifulSoup(r.text, "html.parser")
         links = extract_links_from_soup(soup, r.text)
+
+        # Filter GitHub links
+        github_links = []
         for link in links:
             if link:
                 try:
                     url = urlparse(link)
                     if url.scheme in ["http", "https"] and "github.com" in url.netloc:
-
-                        return link
+                        github_links.append(link)
                 except Exception:
                     pass
+
+        if github_links:
+            # Extract context keywords from the project page
+            context_keywords = []
+            if soup.title:
+                context_keywords.extend(soup.title.get_text().split())
+
+            # Use URL parts as context
+            project_url_parts = row_data["Project"].split('/')
+            context_keywords.extend([part for part in project_url_parts if part and len(part) > 2])
+
+            best_repo = select_best_github_repo(github_links, context_keywords)
+            if best_repo:
+                logger.info(f"Code {best_repo} inferred from Project")
+                return best_repo
     except Exception:
         pass

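The repository picker is a simple heuristic scorer, so a small self-contained example illustrates the ranking this hunk relies on (hypothetical candidate URLs; the expected winner follows from the scoring rules above: profile pages are skipped, issue pages are penalized, keyword and org matches are rewarded):

    from app import select_best_github_repo

    candidates = [
        "https://github.com/someuser",                      # profile only, no repo path -> skipped
        "https://github.com/microsoft/TRELLIS/issues/12",   # keyword match, but /issues penalty
        "https://github.com/microsoft/TRELLIS",             # keyword + org match, no penalty
    ]
    best = select_best_github_repo(candidates, context_keywords=["TRELLIS"])
    # Expected: "https://github.com/microsoft/TRELLIS"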
@@ -1020,11 +1138,105 @@ def infer_license(input_data: str) -> str:
     return ""


+def discover_all_urls(input_data: str) -> Dict[str, Any]:
+    """
+    Discover ALL related URLs from the input by building a complete resource graph.
+    This performs multiple rounds of discovery to find all interconnected resources.
+    """
+    discovered = {
+        "paper": None,
+        "code": None,
+        "project": None,
+        "model": None,
+        "dataset": None,
+        "space": None,
+        "hf_resources": None
+    }
+
+    # Initialize with input
+    row_data = create_row_data(input_data.strip())
+
+    # Round 1: Direct inferences from input
+    if row_data.get("Paper"):
+        discovered["paper"] = row_data["Paper"]
+    if row_data.get("Code"):
+        discovered["code"] = row_data["Code"]
+    if row_data.get("Project"):
+        discovered["project"] = row_data["Project"]
+    if row_data.get("Model"):
+        discovered["model"] = row_data["Model"]
+    if row_data.get("Dataset"):
+        discovered["dataset"] = row_data["Dataset"]
+    if row_data.get("Space"):
+        discovered["space"] = row_data["Space"]
+
+    # Round 2: Cross-inferences - keep discovering until no new URLs found
+    max_rounds = 3
+    for round_num in range(max_rounds):
+        found_new = False
+
+        # Try to find paper from code if we have code but no paper
+        if discovered["code"] and not discovered["paper"]:
+            temp_row = {"Code": discovered["code"], "Paper": None, "Project": discovered["project"]}
+            paper = infer_paper_from_row(temp_row)
+            if paper and paper != discovered["paper"]:
+                discovered["paper"] = paper
+                found_new = True
+
+        # Try to find code from paper if we have paper but no code
+        if discovered["paper"] and not discovered["code"]:
+            temp_row = {"Paper": discovered["paper"], "Code": None, "Project": discovered["project"]}
+            code = infer_code_from_row(temp_row)
+            if code and code != discovered["code"]:
+                discovered["code"] = code
+                found_new = True
+
+        # Try to find code from project if we have project but no code
+        if discovered["project"] and not discovered["code"]:
+            temp_row = {"Project": discovered["project"], "Code": None, "Paper": discovered["paper"]}
+            code = infer_code_from_row(temp_row)
+            if code and code != discovered["code"]:
+                discovered["code"] = code
+                found_new = True
+
+        # Scrape HuggingFace paper page for additional resources
+        if discovered["paper"] and not discovered["hf_resources"]:
+            arxiv_id = get_arxiv_id(discovered["paper"])
+            if "huggingface.co/papers" in discovered["paper"]:
+                discovered["hf_resources"] = scrape_huggingface_paper_page(discovered["paper"])
+                found_new = True
+            elif arxiv_id:
+                hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
+                discovered["hf_resources"] = scrape_huggingface_paper_page(hf_paper_url)
+                if discovered["hf_resources"] and any(discovered["hf_resources"].values()):
+                    found_new = True
+
+        # Extract additional resources from HF scraping
+        if discovered["hf_resources"]:
+            if not discovered["model"] and discovered["hf_resources"]["models"]:
+                discovered["model"] = discovered["hf_resources"]["models"][0]
+                found_new = True
+            if not discovered["dataset"] and discovered["hf_resources"]["datasets"]:
+                discovered["dataset"] = discovered["hf_resources"]["datasets"][0]
+                found_new = True
+            if not discovered["space"] and discovered["hf_resources"]["spaces"]:
+                discovered["space"] = discovered["hf_resources"]["spaces"][0]
+                found_new = True
+            if not discovered["code"] and discovered["hf_resources"]["code"]:
+                discovered["code"] = discovered["hf_resources"]["code"][0]
+                found_new = True
+
+        if not found_new:
+            break
+
+    return discovered
+
+
 @rate_limit("mcp_tools")
 def find_research_relationships(input_data: str) -> Dict[str, Any]:
     """
     Find ALL related research resources across platforms for comprehensive analysis.
-
+    Uses a multi-round discovery approach to build a complete resource graph.

     This is a comprehensive tool that combines all individual inference tools to provide
     a complete picture of a research project's ecosystem. It discovers:
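discover_all_urls is essentially a small fixed-point loop: it seeds the resource graph from whatever the input already provides, then keeps cross-inferring (code to paper, paper to code, project to code, paper to Hugging Face resources) until a round adds nothing new or max_rounds is reached. A rough usage sketch (placeholder input; the inference helpers make live HTTP requests, so actual results depend on the network):

    from app import discover_all_urls

    urls = discover_all_urls("https://github.com/<org>/<repo>")  # hypothetical starting point
    for key in ("paper", "code", "project", "model", "dataset", "space"):
        print(key, "->", urls[key])
    # urls["hf_resources"] additionally holds whatever scrape_huggingface_paper_page
    # returned, or None if no Hugging Face paper page was found.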
@@ -1063,85 +1275,70 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
         "total_inferences": 10
     }

-    #
-
-
-
-
+    # Phase 1: Discover all URLs by building complete resource graph
+    discovered_urls = discover_all_urls(cleaned_input)
+
+    # Phase 2: Create comprehensive row data with all discovered URLs
+    complete_row_data = {
+        "Name": None,
+        "Authors": [],
+        "Paper": discovered_urls["paper"],
+        "Code": discovered_urls["code"],
+        "Project": discovered_urls["project"],
+        "Space": discovered_urls["space"],
+        "Model": discovered_urls["model"],
+        "Dataset": discovered_urls["dataset"],
+        "Orgs": [],
+        "License": None,
+        "Date": None,
+    }
+
+    # Phase 3: Perform all inferences using complete information
+    # Paper
+    if complete_row_data["Paper"]:
+        relationships["paper"] = complete_row_data["Paper"]
         relationships["success_count"] += 1
-
-
-
-
-    if paper_url and "huggingface.co/papers" in paper_url:
-        hf_resources = scrape_huggingface_paper_page(paper_url)
-    elif paper_url and "arxiv.org/abs/" in paper_url:
-        # Try HuggingFace version
-        arxiv_id = get_arxiv_id(paper_url)
-        if arxiv_id:
-            hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
-            hf_resources = scrape_huggingface_paper_page(hf_paper_url)
-
-    # Now perform all other inferences efficiently
-    # Code inference
-    code_url = infer_code_from_row(row_data)
-    if not code_url and hf_resources and hf_resources["code"]:
-        code_url = hf_resources["code"][0]
-    if code_url:
-        relationships["code"] = code_url
+
+    # Code
+    if complete_row_data["Code"]:
+        relationships["code"] = complete_row_data["Code"]
         relationships["success_count"] += 1
-        row_data["Code"] = code_url

-    # Name inference
-    name = infer_name_from_row(
+    # Name inference (try all available sources)
+    name = infer_name_from_row(complete_row_data)
     if name:
         relationships["name"] = name
         relationships["success_count"] += 1

     # Authors inference
-    authors = infer_authors_from_row(
+    authors = infer_authors_from_row(complete_row_data)
     if authors:
         relationships["authors"] = authors
         relationships["success_count"] += 1

     # Date inference
-    date = infer_date_from_row(
+    date = infer_date_from_row(complete_row_data)
     if date:
         relationships["date"] = date
         relationships["success_count"] += 1

-    # Model
-
-
-        model_url = hf_resources["models"][0]
-    else:
-        model_url = infer_model_from_row(row_data)
-    if model_url:
-        relationships["model"] = model_url
+    # Model
+    if complete_row_data["Model"]:
+        relationships["model"] = complete_row_data["Model"]
         relationships["success_count"] += 1

-    # Dataset
-
-
-        dataset_url = hf_resources["datasets"][0]
-    else:
-        dataset_url = infer_dataset_from_row(row_data)
-    if dataset_url:
-        relationships["dataset"] = dataset_url
+    # Dataset
+    if complete_row_data["Dataset"]:
+        relationships["dataset"] = complete_row_data["Dataset"]
         relationships["success_count"] += 1

-    # Space
-
-
-        space_url = hf_resources["spaces"][0]
-    else:
-        space_url = infer_space_from_row(row_data)
-    if space_url:
-        relationships["space"] = space_url
+    # Space
+    if complete_row_data["Space"]:
+        relationships["space"] = complete_row_data["Space"]
         relationships["success_count"] += 1

     # License inference
-    license_info = infer_license_from_row(
+    license_info = infer_license_from_row(complete_row_data)
     if license_info:
         relationships["license"] = license_info
         relationships["success_count"] += 1
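After this change, find_research_relationships no longer interleaves scraping with inference: it first builds the discovered-URL graph, then fills complete_row_data, then runs each per-field inference helper once over that complete row. A hedged call sketch (placeholder input; field names follow the assignments in this hunk, and live lookups mean results vary from run to run):

    from app import find_research_relationships

    result = find_research_relationships("https://github.com/<org>/<repo>")  # hypothetical input
    print(result["success_count"], "fields resolved")
    for key in ("paper", "code", "model", "dataset", "space", "license"):
        print(key, "->", result.get(key))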