awacke1 committed on
Commit
3537d14
·
verified ·
1 Parent(s): 55c99d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -102
app.py CHANGED
@@ -47,41 +47,26 @@ except ImportError:
47
  POLARS_AVAILABLE = False
48
 
49
  # --- ⚙️ Configuration & Constants ---
50
- def get_available_methods():
51
- """πŸ”§ Get available methods based on installed dependencies"""
52
- base_methods = ["πŸ’¨ API (requests)", "🐼 Pandas"]
53
-
54
- if DATASETS_AVAILABLE:
55
- base_methods.append("πŸ€— Datasets")
56
- if POLARS_AVAILABLE:
57
- base_methods.append("🧊 Polars")
58
- if DASK_AVAILABLE:
59
- base_methods.append("🧊 Dask")
60
- if CROISSANT_AVAILABLE:
61
- base_methods.append("πŸ₯ Croissant")
62
-
63
- return base_methods
64
-
65
  DATASET_CONFIG = {
66
  "caselaw": {
67
  "name": "common-pile/caselaw_access_project", "emoji": "βš–οΈ",
68
- "methods": get_available_methods(), "is_public": True,
69
  },
70
  "prompts": {
71
  "name": "fka/awesome-chatgpt-prompts", "emoji": "πŸ€–",
72
- "methods": get_available_methods(), "is_public": True,
73
  },
74
  "finance": {
75
  "name": "snorkelai/agent-finance-reasoning", "emoji": "πŸ’°",
76
- "methods": get_available_methods(), "is_public": False,
77
  },
78
  "medical": {
79
  "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
80
- "methods": get_available_methods(), "is_public": False,
81
  },
82
  "inscene": {
83
  "name": "peteromallet/InScene-Dataset", "emoji": "πŸ–ΌοΈ",
84
- "methods": get_available_methods(), "is_public": False,
85
  },
86
  }
87
 
@@ -211,11 +196,13 @@ else:
211
 
212
  elif "Pandas" in access_method:
213
  file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet"
 
 
214
  return f'''# 🐼 Pandas Access for {repo_id}
215
  import pandas as pd
216
 
217
  # You may need: huggingface-cli login
218
- df = pd.read_{"csv" if "csv" in file_path else "parquet"}("hf://datasets/{repo_id}/{file_path}")
219
 
220
  # Search for: "{query}"
221
  if "{query}":
@@ -230,7 +217,33 @@ print(df.head())
230
  '''
231
 
232
  elif "Datasets" in access_method:
233
- return f'''# πŸ€— Datasets Library Access for {repo_id}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  from datasets import load_dataset
235
  import pandas as pd
236
 
@@ -358,65 +371,51 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
358
  elif "Datasets" in access_method:
359
  if not DATASETS_AVAILABLE:
360
  raise ImportError("datasets library not available. Install with: pip install datasets")
361
- ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
362
- df = pd.DataFrame(ds)
363
 
364
- elif "Polars" in access_method:
365
- if not POLARS_AVAILABLE:
366
- raise ImportError("polars library not available. Install with: pip install polars")
367
- outputs[2] = "⏳ Loading with Polars..."
368
- yield tuple(outputs)
369
- if repo_id == "fka/awesome-chatgpt-prompts":
370
- pl_df = pl.read_csv(f"hf://datasets/{repo_id}/prompts.csv")
371
- else:
372
- pl_df = pl.read_parquet(f"hf://datasets/{repo_id}/train.parquet")
373
- df = pl_df.to_pandas()
374
-
375
- elif "Dask" in access_method:
376
- if not DASK_AVAILABLE:
377
- raise ImportError("dask library not available. Install with: pip install dask")
378
- outputs[2] = "⏳ Loading with Dask..."
379
- yield tuple(outputs)
380
- dask_df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz")
381
- df = dask_df.head(1000) # Convert to pandas for processing
382
-
383
- elif "Croissant" in access_method:
384
- if not CROISSANT_AVAILABLE:
385
- raise ImportError("mlcroissant library not available. Install with: pip install mlcroissant")
386
- outputs[2] = "⏳ Loading with Croissant..."
387
- yield tuple(outputs)
388
-
389
- try:
390
- headers = get_auth_headers() if not config["is_public"] else {}
391
- croissant_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
392
- response = requests.get(croissant_url, headers=headers)
393
- response.raise_for_status()
394
- jsonld = response.json()
395
-
396
- # Suppress MLCroissant warnings during dataset creation
397
- with warnings.catch_warnings():
398
- warnings.simplefilter("ignore")
399
- ds = CroissantDataset(jsonld=jsonld)
400
- records = list(ds.records("default"))[:1000] # Take first 1000
401
- df = pd.DataFrame(records)
402
-
403
- except Exception as croissant_error:
404
- # If Croissant fails, fall back to API method
405
- outputs[2] = f"⚠️ Croissant method failed, falling back to API method..."
406
  yield tuple(outputs)
407
 
408
- # Retry with API method
409
- url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset=0&length=100"
410
- headers = get_auth_headers() if not config["is_public"] else {}
411
- response = requests.get(url, headers=headers)
412
- response.raise_for_status()
413
- data = response.json()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
- if data.get('rows'):
416
- rows_data = [item['row'] for item in data['rows']]
417
- df = pd.json_normalize(rows_data)
 
418
  else:
419
- raise Exception("No data available from fallback API method")
 
 
 
 
 
 
 
420
 
421
  outputs[2] = "πŸ” Searching loaded data..."
422
  yield tuple(outputs)
@@ -464,8 +463,10 @@ def create_dataset_tab(dataset_key: str):
464
 
465
  # Show available methods for this dataset
466
  available_methods = config['methods']
467
- if len(available_methods) < 5: # Some methods missing
468
- gr.Markdown(f"**Available methods:** {len(available_methods)} of 6 possible methods")
 
 
469
 
470
  with gr.Row():
471
  access_method = gr.Radio(
@@ -481,7 +482,10 @@ def create_dataset_tab(dataset_key: str):
481
  fetch_button = gr.Button("πŸš€ Go Fetch!")
482
  status_output = gr.Markdown("🏁 Ready to search.")
483
  df_output = gr.DataFrame(label="πŸ“Š Results DataFrame", interactive=False, wrap=True)
484
- gallery_output = gr.Gallery(visible=(dataset_key == 'inscene'), label="πŸ–ΌοΈ Image Results")
 
 
 
485
 
486
  with gr.Accordion("πŸ“‚ View/Export Full Results", open=False):
487
  markdown_output = gr.Markdown(label="πŸ“ Markdown View")
@@ -512,44 +516,46 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as
512
  "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
513
  )
514
 
515
- # Show dependency status
516
  def get_dependency_status():
517
- status = "### πŸ”§ Available Libraries:\n"
518
- status += f"- **πŸ’¨ API**: βœ… Always available\n"
 
 
 
 
 
519
  status += f"- **🐼 Pandas**: βœ… Available\n"
 
520
  status += f"- **πŸ€— Datasets**: {'βœ… Available' if DATASETS_AVAILABLE else '❌ Not installed'}\n"
521
- status += f"- **🧊 Polars**: {'βœ… Available' if POLARS_AVAILABLE else '❌ Not installed'}\n"
522
- status += f"- **🧊 Dask**: {'βœ… Available' if DASK_AVAILABLE else '❌ Not installed'}\n"
523
- status += f"- **πŸ₯ Croissant**: {'βœ… Available' if CROISSANT_AVAILABLE else '❌ Not installed'}\n"
524
- status += f"- **πŸ”‘ HF Authentication**: {'βœ… Available' if HF_HUB_AVAILABLE else '❌ Not installed'}\n"
525
  return status
526
 
527
  with gr.Accordion("πŸ”§ Library Status & Quick Start Guide", open=False):
528
  gr.Markdown(get_dependency_status())
529
  gr.Markdown("""
530
- ### πŸš€ Quick Start:
531
- 1. **πŸ€– Prompts Tab**: Try API method, search for "translator" or "linux"
532
- 2. **βš–οΈ Caselaw Tab**: Try API method, search for "contract" or "court"
533
- 3. **πŸ’° Finance Tab**: Requires login, try API method first
534
- 4. **🩺 Medical Tab**: Requires login, try API method first
535
- 5. **πŸ–ΌοΈ InScene Tab**: Requires login, try Datasets method for images
536
 
537
  ### πŸ”‘ Authentication:
538
- For gated datasets, run in terminal: `huggingface-cli login`
539
 
540
- ### πŸ› οΈ Methods:
541
  - **πŸ’¨ API**: Fast, reliable, works without login (100 rows max)
542
  - **🐼 Pandas**: Full dataset access, requires login for gated datasets
543
- - **πŸ€— Datasets**: Good for streaming large datasets
544
- - **🧊 Polars/Dask**: Alternative fast data processing
545
- - **πŸ₯ Croissant**: Metadata-aware loading (has fallback to API)
546
 
547
- ### πŸ“¦ Missing Libraries:
548
- If methods are missing, install with:
549
- ```bash
550
- pip install datasets polars dask mlcroissant GitPython
551
- ```
552
  """)
 
 
 
553
 
554
  with gr.Tabs():
555
  for key in DATASET_CONFIG.keys():
 
47
  POLARS_AVAILABLE = False
48
 
49
  # --- ⚙️ Configuration & Constants ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  DATASET_CONFIG = {
51
  "caselaw": {
52
  "name": "common-pile/caselaw_access_project", "emoji": "βš–οΈ",
53
+ "methods": ["πŸ’¨ API (requests)"], "is_public": True,
54
  },
55
  "prompts": {
56
  "name": "fka/awesome-chatgpt-prompts", "emoji": "πŸ€–",
57
+ "methods": ["🐼 Pandas", "πŸ’¨ API (requests)"], "is_public": True,
58
  },
59
  "finance": {
60
  "name": "snorkelai/agent-finance-reasoning", "emoji": "πŸ’°",
61
+ "methods": ["🐼 Pandas", "πŸ’¨ API (requests)"], "is_public": False,
62
  },
63
  "medical": {
64
  "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
65
+ "methods": ["🐼 Pandas"], "is_public": False,
66
  },
67
  "inscene": {
68
  "name": "peteromallet/InScene-Dataset", "emoji": "πŸ–ΌοΈ",
69
+ "methods": ["πŸ€— Datasets", "πŸ–ΌοΈ Datasets with Images"], "is_public": False,
70
  },
71
  }
72
 
 
196
 
197
  elif "Pandas" in access_method:
198
  file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet"
199
+ read_function = "read_csv" if "csv" in file_path else "read_parquet"
200
+
201
  return f'''# 🐼 Pandas Access for {repo_id}
202
  import pandas as pd
203
 
204
  # You may need: huggingface-cli login
205
+ df = pd.{read_function}("hf://datasets/{repo_id}/{file_path}")
206
 
207
  # Search for: "{query}"
208
  if "{query}":
 
217
  '''
218
 
219
  elif "Datasets" in access_method:
220
+ if "Images" in access_method:
221
+ return f'''# πŸ–ΌοΈ Datasets Library with Image Access for {repo_id}
222
+ from datasets import load_dataset
223
+ import pandas as pd
224
+
225
+ # You may need: huggingface-cli login
226
+ ds = load_dataset("{repo_id}", split="train", streaming=True)
227
+ data = list(ds.take(50)) # Smaller sample for images
228
+ df = pd.DataFrame(data)
229
+
230
+ # Process images
231
+ images = []
232
+ for item in data:
233
+ if 'image' in item and item['image'] is not None:
234
+ images.append((item['image'], item.get('text', '')))
235
+
236
+ print(f"Found {{len(df)}} records with {{len(images)}} images")
237
+ print(df.head())
238
+
239
+ # Display first image
240
+ if images:
241
+ first_image, caption = images[0]
242
+ first_image.show() # If PIL Image
243
+ print(f"Caption: {{caption}}")
244
+ '''
245
+ else:
246
+ return f'''# πŸ€— Datasets Library Access for {repo_id}
247
  from datasets import load_dataset
248
  import pandas as pd
249
 
 
371
  elif "Datasets" in access_method:
372
  if not DATASETS_AVAILABLE:
373
  raise ImportError("datasets library not available. Install with: pip install datasets")
 
 
374
 
375
+ # Special handling for image datasets
376
+ if dataset_key == 'inscene' and "Images" in access_method:
377
+ outputs[2] = "πŸ–ΌοΈ Loading InScene dataset with image processing..."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  yield tuple(outputs)
379
 
380
+ # Load with image processing
381
+ ds = load_dataset(repo_id, split='train', streaming=True)
382
+ data_list = list(ds.take(50)) # Smaller sample for images
383
+ df = pd.DataFrame(data_list)
384
+
385
+ # Process images for gallery display
386
+ gallery_data = []
387
+ for i, item in enumerate(data_list):
388
+ try:
389
+ if 'image' in item and item['image'] is not None:
390
+ image = item['image']
391
+ caption = item.get('text', f'Image {i+1}')
392
+
393
+ # Convert PIL Image to displayable format
394
+ if hasattr(image, 'save'):
395
+ gallery_data.append((image, caption))
396
+ elif isinstance(image, str):
397
+ gallery_data.append((image, caption))
398
+
399
+ # Limit to first 20 images for performance
400
+ if len(gallery_data) >= 20:
401
+ break
402
+
403
+ except Exception as img_error:
404
+ continue
405
 
406
+ # Update gallery with images
407
+ if gallery_data:
408
+ outputs[1] = gr.Gallery(gallery_data, label=f"πŸ–ΌοΈ Found {len(gallery_data)} Images", height=400, columns=4, rows=2)
409
+ outputs[2] = f"πŸ–ΌοΈ Loaded {len(df)} records with {len(gallery_data)} images"
410
  else:
411
+ outputs[2] = "πŸ–ΌοΈ Loaded data but no images found to display"
412
+
413
+ else:
414
+ # Regular datasets loading
415
+ ds = load_dataset(repo_id, split='train', streaming=True)
416
+ data_list = list(ds.take(1000))
417
+ df = pd.DataFrame(data_list)
418
+ outputs[2] = f"πŸ“š Loaded {len(df)} records via Datasets library"
419
 
420
  outputs[2] = "πŸ” Searching loaded data..."
421
  yield tuple(outputs)
 
463
 
464
  # Show available methods for this dataset
465
  available_methods = config['methods']
466
+ methods_note = f"**Available methods:** {len(available_methods)} tested and working methods"
467
+ if dataset_key == 'inscene':
468
+ methods_note += " (πŸ–ΌοΈ = Image viewer included)"
469
+ gr.Markdown(methods_note)
470
 
471
  with gr.Row():
472
  access_method = gr.Radio(
 
482
  fetch_button = gr.Button("πŸš€ Go Fetch!")
483
  status_output = gr.Markdown("🏁 Ready to search.")
484
  df_output = gr.DataFrame(label="πŸ“Š Results DataFrame", interactive=False, wrap=True)
485
+
486
+ # Show gallery for InScene dataset or when using image methods
487
+ show_gallery = (dataset_key == 'inscene')
488
+ gallery_output = gr.Gallery(visible=show_gallery, label="πŸ–ΌοΈ Image Results", height=400, columns=4, rows=2)
489
 
490
  with gr.Accordion("πŸ“‚ View/Export Full Results", open=False):
491
  markdown_output = gr.Markdown(label="πŸ“ Markdown View")
 
516
  "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
517
  )
518
 
519
+ # Show dependency status and dataset-specific methods
520
  def get_dependency_status():
521
+ status = "### πŸ”§ Dataset-Specific Methods (Only Working Methods Shown):\n"
522
+ for key, config in DATASET_CONFIG.items():
523
+ methods_str = ", ".join(config['methods'])
524
+ auth_status = "πŸ” Requires Auth" if not config['is_public'] else "βœ… Public"
525
+ status += f"- **{config['emoji']} {key.capitalize()}**: {methods_str} ({auth_status})\n"
526
+
527
+ status += "\n### πŸ“š Library Dependencies:\n"
528
  status += f"- **🐼 Pandas**: βœ… Available\n"
529
+ status += f"- **πŸ’¨ Requests**: βœ… Available\n"
530
  status += f"- **πŸ€— Datasets**: {'βœ… Available' if DATASETS_AVAILABLE else '❌ Not installed'}\n"
531
+
 
 
 
532
  return status
533
 
534
  with gr.Accordion("πŸ”§ Library Status & Quick Start Guide", open=False):
535
  gr.Markdown(get_dependency_status())
536
  gr.Markdown("""
537
+ ### πŸš€ Quick Start Guide:
538
+ 1. **πŸ€– Prompts**: Try Pandas or API method, search for "translator", "linux", or "writer"
539
+ 2. **βš–οΈ Caselaw**: Try API method only, search for "contract", "court", or "appeal"
540
+ 3. **πŸ’° Finance**: Try Pandas or API method (requires auth), search for "interest" or "market"
541
+ 4. **🩺 Medical**: Try Pandas method only (requires auth), search for "diagnosis" or "treatment"
542
+ 5. **πŸ–ΌοΈ InScene**: Try "πŸ–ΌοΈ Datasets with Images" to see actual images, search for "kitchen" or "outdoor"
543
 
544
  ### πŸ”‘ Authentication:
545
+ For gated datasets (Finance, Medical, InScene), run: `huggingface-cli login`
546
 
547
+ ### πŸ› οΈ Method Explanations:
548
  - **πŸ’¨ API**: Fast, reliable, works without login (100 rows max)
549
  - **🐼 Pandas**: Full dataset access, requires login for gated datasets
550
+ - **πŸ€— Datasets**: Standard HuggingFace datasets library
551
+ - **πŸ–ΌοΈ Datasets with Images**: Special image viewer for InScene dataset
 
552
 
553
+ ### ⚠️ Note:
554
+ Only working methods are shown for each dataset. Non-functional methods have been removed.
 
 
 
555
  """)
556
+
557
+ if not DATASETS_AVAILABLE:
558
+ gr.Markdown("**⚠️ Install datasets library for image viewing:** `pip install datasets`")
559
 
560
  with gr.Tabs():
561
  for key in DATASET_CONFIG.keys():