sumuks HF Staff committed on
Commit
a94ffeb
·
verified ·
1 Parent(s): f08266c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -25
app.py CHANGED
@@ -34,6 +34,9 @@ def filterfunc(x: dict) -> bool:
34
  if len(x.get("text", "").split()) < 100:
35
  return False
36
 
 
 
 
37
  excluded = {"Promotional/Advertisement", "Machine-Generated", "Images/Videos/Audio",
38
  "Truncated", "Spam/Ads", "Product Page", "Content Listing"}
39
 
@@ -42,31 +45,6 @@ def filterfunc(x: dict) -> bool:
42
  if label := x.get("eai_taxonomy", {}).get(version, {}).get(level, {}).get("label"):
43
  if label in excluded:
44
  return False
45
-
46
- # FDC pairing: Medicine (61) with another science code - widened scope
47
- dds_primary = x.get("eai_taxonomy", {}).get("dds", {}).get("primary", {}).get("label", "")
48
- dds_secondary = x.get("eai_taxonomy", {}).get("dds", {}).get("secondary", {}).get("label", "")
49
-
50
- # Debug: Log some DDS codes to see what's available
51
- if dds_primary or dds_secondary:
52
- logger.debug(f"DDS codes found - Primary: {dds_primary}, Secondary: {dds_secondary}")
53
-
54
- # Widened FDC filter: Accept if ANY DDS code starts with medical/science prefixes
55
- fdc_paired = (
56
- # Original strict pairing
57
- (prefix(dds_primary) in FDC_KEEP and prefix(dds_secondary) in SCIENCE_CODES) or
58
- (prefix(dds_secondary) in FDC_KEEP and prefix(dds_primary) in SCIENCE_CODES) or
59
- # Widened: Accept if primary OR secondary has medical code
60
- prefix(dds_primary) in FDC_KEEP or
61
- prefix(dds_secondary) in FDC_KEEP or
62
- # Even wider: Accept if ANY DDS code is in science codes
63
- prefix(dds_primary) in SCIENCE_CODES or
64
- prefix(dds_secondary) in SCIENCE_CODES
65
- )
66
-
67
- if not fdc_paired:
68
- return False
69
-
70
  return True
71
 
72
 
 
34
  if len(x.get("text", "").split()) < 100:
35
  return False
36
 
37
+ if x.get("eai_taxonomy", {}).get("free_decimal_correspondence", {}).get("primary", {}).get("code", "")[:2] != "61":
38
+ return False
39
+
40
  excluded = {"Promotional/Advertisement", "Machine-Generated", "Images/Videos/Audio",
41
  "Truncated", "Spam/Ads", "Product Page", "Content Listing"}
42
 
 
45
  if label := x.get("eai_taxonomy", {}).get(version, {}).get(level, {}).get("label"):
46
  if label in excluded:
47
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  return True
49
 
50