|
|
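"""Hugging Face Dataset Explorer.

A Gradio app that searches a small set of Hugging Face datasets via the
datasets-server API, pandas hf:// paths, or the `datasets` library, with
CSV/XLSX export and an image gallery for the InScene dataset.
"""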
|
import gradio as gr |
|
import pandas as pd |
|
import requests |
|
import io |
|
import warnings |
|
import traceback |
|
import json |
|
import tempfile |
|
import os |
|
import logging |
|
|
|
|
|
warnings.filterwarnings("ignore") |
|
logging.getLogger("absl").setLevel(logging.ERROR) |
|
os.environ["ABSL_LOG_LEVEL"] = "2" |
|
|
|
|
|
try: |
|
import dask.dataframe as dd |
|
DASK_AVAILABLE = True |
|
except ImportError: |
|
DASK_AVAILABLE = False |
|
|
|
try: |
|
from datasets import load_dataset, Image |
|
DATASETS_AVAILABLE = True |
|
except ImportError: |
|
DATASETS_AVAILABLE = False |
|
|
|
try: |
|
from mlcroissant import Dataset as CroissantDataset |
|
CROISSANT_AVAILABLE = True |
|
except ImportError: |
|
CROISSANT_AVAILABLE = False |
|
|
|
try: |
|
from huggingface_hub import get_token |
|
HF_HUB_AVAILABLE = True |
|
except ImportError: |
|
HF_HUB_AVAILABLE = False |
|
|
|
try: |
|
import polars as pl |
|
POLARS_AVAILABLE = True |
|
except ImportError: |
|
POLARS_AVAILABLE = False |
|
|
|
|
|
DATASET_CONFIG = {

    "caselaw": {

        "name": "common-pile/caselaw_access_project", "emoji": "⚖️",

        "methods": ["📨 API (requests)"], "is_public": True,

    },

    "prompts": {

        "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",

        "methods": ["🐼 Pandas", "📨 API (requests)"], "is_public": True,

    },

    "finance": {

        "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",

        "methods": ["🐼 Pandas", "📨 API (requests)"], "is_public": False,

    },

    "medical": {

        "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",

        "methods": ["🐼 Pandas"], "is_public": False,

    },

    "inscene": {

        "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",

        "methods": ["🤗 Datasets", "🖼️ Datasets with Images"], "is_public": False,

    },

}
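
# Note: these method labels are reused verbatim as the radio choices, and
# fetch_data / generate_code_snippet dispatch on the substrings "API",
# "Pandas", "Datasets", and "Images"; the emoji are cosmetic, the keywords
# are load-bearing.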
|
|
|
|
|
|
|
def get_auth_headers(): |
|
"""π Get authentication headers if available""" |
|
if not HF_HUB_AVAILABLE: |
|
return {} |
|
try: |
|
token = get_token() |
|
return {"Authorization": f"Bearer {token}"} if token else {} |
|
except Exception: |
|
return {} |
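
# Illustrative use: merge these headers into any datasets-server request, e.g.
#   requests.get("https://datasets-server.huggingface.co/rows", params=..., headers=get_auth_headers())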
|
|
|
|
|
def dataframe_to_outputs(df: pd.DataFrame): |
|
""" |
|
π Takes a DataFrame and transforms it into various formats. |
|
Now uses temporary files for maximum Gradio compatibility. |
|
""" |
|
if df.empty: |
|
return "No results found. π€·", None, None, "No results to copy." |
|
|
|
df_str = df.astype(str) |
|
markdown_output = df_str.to_markdown(index=False) |
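
    # Write exports to delete=False temp files so Gradio can serve them after
    # this function returns (they are not cleaned up here).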
|
|
|
|
|
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv: |
|
df.to_csv(tmp_csv.name, index=False) |
|
csv_path = tmp_csv.name |
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx: |
|
df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl') |
|
xlsx_path = tmp_xlsx.name |
|
|
|
tab_delimited_output = df.to_csv(sep='\t', index=False) |
|
|
|
return ( |
|
markdown_output, |
|
csv_path, |
|
xlsx_path, |
|
tab_delimited_output, |
|
) |
|
|
|
def handle_error(e: Exception, request=None, response=None): |
|
""" |
|
π± Oh no! An error! This function now creates a detailed debug log. |
|
""" |
|
error_message = f"π¨ An error occurred: {str(e)}\n" |
|
auth_tip = "π For gated datasets, did you log in? Try `huggingface-cli login` in your terminal." |
|
full_trace = traceback.format_exc() |
|
print(full_trace) |
|
if "401" in str(e) or "Gated" in str(e): |
|
error_message += auth_tip |
|
|
|
debug_log = f"""--- π DEBUG LOG ---\nTraceback:\n{full_trace}\n\nException Type: {type(e).__name__}\nException Details: {e}\n""" |
|
if request: |
|
debug_log += f"""\n--- REQUEST ---\nMethod: {request.method}\nURL: {request.url}\nHeaders: {json.dumps(dict(request.headers), indent=2)}\n""" |
|
if response is not None: |
|
try: |
|
response_text = json.dumps(response.json(), indent=2) |
|
except json.JSONDecodeError: |
|
response_text = response.text |
|
debug_log += f"""\n--- RESPONSE ---\nStatus Code: {response.status_code}\nHeaders: {json.dumps(dict(response.headers), indent=2)}\nContent:\n{response_text}\n""" |
|
|
|
return ( |
|
        pd.DataFrame(), gr.Gallery(None), "### 🚨 Error\nAn error occurred. See the debug log below for details.",

        "", None, None, "", f"```python\n# 🚨 Error during execution:\n# {e}\n```",
|
gr.Code(value=debug_log, visible=True) |
|
) |
|
|
|
def search_dataframe(df: pd.DataFrame, query: str): |
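    """🔍 Case-insensitive substring search over every string column.

    An empty query returns the first 100 rows as a sample; a frame with no
    string columns yields an empty result.
    """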
|
if not query: |
|
return df.head(100) |
|
string_cols = df.select_dtypes(include=['object', 'string']).columns |
|
if string_cols.empty: |
|
return pd.DataFrame() |
|
    mask = pd.Series(False, index=df.index)  # align the mask with df's index, not just 0..n-1
|
for col in string_cols: |
|
mask |= df[col].astype(str).str.contains(query, case=False, na=False) |
|
return df[mask] |
|
|
|
def generate_code_snippet(dataset_key: str, access_method: str, query: str): |
|
""" |
|
π» Generate Python code snippet for the current operation |
|
""" |
|
config = DATASET_CONFIG[dataset_key] |
|
repo_id = config["name"] |
|
|
|
if "API" in access_method: |
|
        return f'''# 🌐 API Access for {repo_id}
|
import requests |
|
import pandas as pd |
|
|
|
url = "https://datasets-server.huggingface.co/rows" |
|
params = {{ |
|
"dataset": "{repo_id}", |
|
"config": "default", |
|
"split": "train", |
|
"offset": 0, |
|
"length": 100 |
|
}} |
|
|
|
headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}} if needed else {{}} |
|
response = requests.get(url, params=params, headers=headers) |
|
|
|
if response.status_code == 200: |
|
data = response.json() |
|
rows_data = [item['row'] for item in data['rows']] |
|
df = pd.json_normalize(rows_data) |
|
|
|
# Search for: "{query}" |
|
if "{query}": |
|
string_cols = df.select_dtypes(include=['object', 'string']).columns |
|
mask = pd.Series([False] * len(df)) |
|
for col in string_cols: |
|
mask |= df[col].astype(str).str.contains("{query}", case=False, na=False) |
|
df = df[mask] |
|
|
|
print(f"Found {{len(df)}} results") |
|
print(df.head()) |
|
else: |
|
print(f"Error: {{response.status_code}} - {{response.text}}") |
|
''' |
|
|
|
elif "Pandas" in access_method: |
|
file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet" |
|
read_function = "read_csv" if "csv" in file_path else "read_parquet" |
|
|
|
        return f'''# 🐼 Pandas Access for {repo_id}
|
import pandas as pd |
|
|
|
# You may need: huggingface-cli login |
|
df = pd.{read_function}("hf://datasets/{repo_id}/{file_path}") |
|
|
|
# Search for: "{query}" |
|
if "{query}": |
|
string_cols = df.select_dtypes(include=['object', 'string']).columns |
|
mask = pd.Series([False] * len(df)) |
|
for col in string_cols: |
|
mask |= df[col].astype(str).str.contains("{query}", case=False, na=False) |
|
df = df[mask] |
|
|
|
print(f"Found {{len(df)}} results") |
|
print(df.head()) |
|
''' |
|
|
|
elif "Datasets" in access_method: |
|
if "Images" in access_method: |
|
            return f'''# 🖼️ Datasets Library with Image Access for {repo_id}
|
from datasets import load_dataset |
|
import pandas as pd |
|
|
|
# You may need: huggingface-cli login |
|
ds = load_dataset("{repo_id}", split="train", streaming=True) |
|
data = list(ds.take(50)) # Smaller sample for images |
|
df = pd.DataFrame(data) |
|
|
|
# Process images |
|
images = [] |
|
for item in data: |
|
if 'image' in item and item['image'] is not None: |
|
images.append((item['image'], item.get('text', ''))) |
|
|
|
print(f"Found {{len(df)}} records with {{len(images)}} images") |
|
print(df.head()) |
|
|
|
# Display first image |
|
if images: |
|
first_image, caption = images[0] |
|
first_image.show() # If PIL Image |
|
print(f"Caption: {{caption}}") |
|
''' |
|
else: |
|
            return f'''# 🤗 Datasets Library Access for {repo_id}
|
from datasets import load_dataset |
|
import pandas as pd |
|
|
|
# You may need: huggingface-cli login |
|
ds = load_dataset("{repo_id}", split="train", streaming=True) |
|
data = list(ds.take(1000)) |
|
df = pd.DataFrame(data) |
|
|
|
# Search for: "{query}" |
|
if "{query}": |
|
string_cols = df.select_dtypes(include=['object', 'string']).columns |
|
mask = pd.Series([False] * len(df)) |
|
for col in string_cols: |
|
mask |= df[col].astype(str).str.contains("{query}", case=False, na=False) |
|
df = df[mask] |
|
|
|
print(f"Found {{len(df)}} results") |
|
print(df.head()) |
|
''' |
|
|
|
else: |
|
return f"# Code generation for {access_method} not implemented yet" |
|
|
|
|
|
def fetch_data(dataset_key: str, access_method: str, query: str): |
|
""" |
|
π Main mission control. Always yields a tuple of 9 values to match the UI components. |
|
""" |
|
    outputs = [pd.DataFrame(), None, "👍 Ready.", "", None, None, "", "", gr.Code(visible=False)]
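    # Slot map (must stay in sync with the click handler's outputs list):
    # 0=DataFrame, 1=gallery, 2=status, 3=markdown, 4=CSV file, 5=XLSX file,
    # 6=tab-delimited text, 7=code snippet, 8=debug log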
|
req, res = None, None |
|
try: |
|
config = DATASET_CONFIG[dataset_key] |
|
repo_id = config["name"] |
|
|
|
|
|
code_snippet = generate_code_snippet(dataset_key, access_method, query) |
|
outputs[7] = code_snippet |
|
|
|
if "API" in access_method: |
|
all_results_df = pd.DataFrame() |
|
MAX_PAGES = 5 |
|
PAGE_SIZE = 100 |
|
|
|
if not query: |
|
MAX_PAGES = 1 |
|
outputs[2] = "β³ No search term. Fetching first 100 records as a sample..." |
|
yield tuple(outputs) |
|
|
|
for page in range(MAX_PAGES): |
|
if query: |
|
outputs[2] = f"β³ Searching page {page + 1}..." |
|
yield tuple(outputs) |
|
|
|
offset = page * PAGE_SIZE |
|
url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}" |
|
headers = get_auth_headers() if not config["is_public"] else {} |
|
|
|
res = requests.get(url, headers=headers) |
|
req = res.request |
|
res.raise_for_status() |
|
data = res.json() |
|
|
|
if not data.get('rows'): |
|
outputs[2] = "π No more data to search." |
|
yield tuple(outputs) |
|
break |
|
|
|
|
|
|
|
rows_data = [item['row'] for item in data['rows']] |
|
page_df = pd.json_normalize(rows_data) |
|
|
|
found_in_page = search_dataframe(page_df, query) |
|
|
|
if not found_in_page.empty: |
|
all_results_df = pd.concat([all_results_df, found_in_page]).reset_index(drop=True) |
|
outputs[0] = all_results_df |
|
outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df) |
|
outputs[2] = f"β
Found **{len(all_results_df)}** results so far..." |
|
|
|
if dataset_key == 'inscene': |
|
try: |
|
gallery_data = [] |
|
for _, row in all_results_df.iterrows(): |
|
if 'image' in row: |
|
image_data = row.get('image') |
|
text_data = row.get('text', '') |
|
|
|
|
|
if hasattr(image_data, 'save'): |
|
gallery_data.append((image_data, text_data)) |
|
elif isinstance(image_data, str): |
|
gallery_data.append((image_data, text_data)) |
|
|
|
if gallery_data: |
|
                                outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
|
                        except Exception:

                            pass  # gallery preview is best-effort; the results table keeps updating
|
yield tuple(outputs) |
|
|
|
outputs[2] = f"π Search complete. Found a total of **{len(all_results_df)}** results." |
|
yield tuple(outputs) |
|
return |
|
|
|
outputs[2] = f"β³ Loading data via `{access_method}`..." |
|
yield tuple(outputs) |
|
|
|
df = pd.DataFrame() |
|
|
|
if "Pandas" in access_method: |
|
file_path = f"hf://datasets/{repo_id}/" |
|
if repo_id == "fka/awesome-chatgpt-prompts": |
|
file_path += "prompts.csv" |
|
df = pd.read_csv(file_path) |
|
else: |
|
try: |
|
df = pd.read_parquet(f"{file_path}data/train-00000-of-00001.parquet") |
|
            except Exception:
|
try: |
|
df = pd.read_parquet(f"{file_path}train.parquet") |
|
                except Exception:
|
df = pd.read_json(f"{file_path}medical_o1_sft.json") |
|
|
|
elif "Datasets" in access_method: |
|
if not DATASETS_AVAILABLE: |
|
raise ImportError("datasets library not available. Install with: pip install datasets") |
|
|
|
|
|
if dataset_key == 'inscene' and "Images" in access_method: |
|
outputs[2] = "πΌοΈ Loading InScene dataset with image processing..." |
|
yield tuple(outputs) |
|
|
|
|
|
ds = load_dataset(repo_id, split='train', streaming=True) |
|
data_list = list(ds.take(50)) |
|
df = pd.DataFrame(data_list) |
|
|
|
|
|
gallery_data = [] |
|
for i, item in enumerate(data_list): |
|
try: |
|
if 'image' in item and item['image'] is not None: |
|
image = item['image'] |
|
caption = item.get('text', f'Image {i+1}') |
|
|
|
|
|
if hasattr(image, 'save'): |
|
gallery_data.append((image, caption)) |
|
elif isinstance(image, str): |
|
gallery_data.append((image, caption)) |
|
|
|
|
|
if len(gallery_data) >= 20: |
|
break |
|
|
|
except Exception as img_error: |
|
continue |
|
|
|
|
|
if gallery_data: |
|
                    outputs[1] = gr.Gallery(gallery_data, label=f"🖼️ Found {len(gallery_data)} Images", height=400, columns=4, rows=2)

                    outputs[2] = f"🖼️ Loaded {len(df)} records with {len(gallery_data)} images"

                else:

                    outputs[2] = "🖼️ Loaded data but no images found to display"
|
|
|
else: |
|
|
|
ds = load_dataset(repo_id, split='train', streaming=True) |
|
data_list = list(ds.take(1000)) |
|
df = pd.DataFrame(data_list) |
|
outputs[2] = f"π Loaded {len(df)} records via Datasets library" |
|
|
|
outputs[2] = "π Searching loaded data..." |
|
yield tuple(outputs) |
|
|
|
final_df = search_dataframe(df, query) |
|
|
|
outputs[0] = final_df |
|
outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df) |
|
outputs[2] = f"π Search complete. Found **{len(final_df)}** results." |
|
|
|
if dataset_key == 'inscene' and not final_df.empty: |
|
|
|
try: |
|
gallery_data = [] |
|
for _, row in final_df.iterrows(): |
|
if 'image' in row: |
|
image_data = row.get('image') |
|
text_data = row.get('text', '') |
|
|
|
|
|
if hasattr(image_data, 'save'): |
|
gallery_data.append((image_data, text_data)) |
|
elif isinstance(image_data, str): |
|
gallery_data.append((image_data, text_data)) |
|
|
|
if gallery_data: |
|
                    outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
|
except Exception as img_error: |
|
outputs[2] += f"\nβ οΈ Image display error: {str(img_error)}" |
|
|
|
yield tuple(outputs) |
|
|
|
except Exception as e: |
|
yield handle_error(e, req, res) |
|
|
|
|
|
|
|
def create_dataset_tab(dataset_key: str): |
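    """Build one dataset tab: method radio, query box, results table (plus a
    gallery for InScene), export accordion, code snippet, and debug log."""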
|
config = DATASET_CONFIG[dataset_key] |
|
|
|
with gr.Tab(f"{config['emoji']} {dataset_key.capitalize()}"): |
|
gr.Markdown(f"## {config['emoji']} Query the `{config['name']}` Dataset") |
|
if not config['is_public']: |
|
gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.") |
|
|
|
|
|
available_methods = config['methods'] |
|
methods_note = f"**Available methods:** {len(available_methods)} tested and working methods" |
|
if dataset_key == 'inscene': |
|
methods_note += " (πΌοΈ = Image viewer included)" |
|
gr.Markdown(methods_note) |
|
|
|
with gr.Row(): |
|
access_method = gr.Radio( |
|
available_methods, |
|
label="π Access Method", |
|
value=available_methods[0] if available_methods else "π¨ API (requests)" |
|
) |
|
query = gr.Textbox( |
|
label="π Search Query", |
|
placeholder="Enter any text to search, or leave blank for samples..." |
|
) |
|
|
|
        fetch_button = gr.Button("🐕 Go Fetch!")

        status_output = gr.Markdown("👍 Ready to search.")

        df_output = gr.DataFrame(label="📊 Results DataFrame", interactive=False, wrap=True)
|
|
|
|
|
show_gallery = (dataset_key == 'inscene') |
|
        gallery_output = gr.Gallery(visible=show_gallery, label="🖼️ Image Results", height=400, columns=4, rows=2)
|
|
|
with gr.Accordion("π View/Export Full Results", open=False): |
|
            markdown_output = gr.Markdown(label="📝 Markdown View")
|
with gr.Row(): |
|
                csv_output = gr.File(label="⬇️ Download CSV")

                xlsx_output = gr.File(label="⬇️ Download XLSX")
|
            copy_output = gr.Code(label="📋 Copy-Paste (Tab-Delimited)")
|
|
|
        code_output = gr.Code(label="💻 Python Code Snippet", language="python")
|
|
|
        debug_log_output = gr.Code(label="🐛 Debug Log", visible=False)
|
|
|
fetch_button.click( |
|
fn=fetch_data, |
|
inputs=[gr.State(dataset_key), access_method, query], |
|
outputs=[ |
|
df_output, gallery_output, status_output, markdown_output, |
|
csv_output, xlsx_output, copy_output, code_output, |
|
debug_log_output |
|
] |
|
) |
|
|
|
|
|
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo: |
|
gr.Markdown("# π€ Hugging Face Dataset Explorer") |
|
gr.Markdown( |
|
"Select a dataset, choose an access method, and type a query. " |
|
"If an error occurs, a detailed debug log will appear to help troubleshoot the issue." |
|
) |
|
|
|
|
|
def get_dependency_status(): |
|
status = "### π§ Dataset-Specific Methods (Only Working Methods Shown):\n" |
|
for key, config in DATASET_CONFIG.items(): |
|
methods_str = ", ".join(config['methods']) |
|
auth_status = "π Requires Auth" if not config['is_public'] else "β
Public" |
|
status += f"- **{config['emoji']} {key.capitalize()}**: {methods_str} ({auth_status})\n" |
|
|
|
status += "\n### π Library Dependencies:\n" |
|
status += f"- **πΌ Pandas**: β
Available\n" |
|
status += f"- **π¨ Requests**: β
Available\n" |
|
status += f"- **π€ Datasets**: {'β
Available' if DATASETS_AVAILABLE else 'β Not installed'}\n" |
|
|
|
return status |
|
|
|
with gr.Accordion("π§ Library Status & Quick Start Guide", open=False): |
|
gr.Markdown(get_dependency_status()) |
|
gr.Markdown(""" |
|
    ### 🚀 Quick Start Guide:

    1. **🤖 Prompts**: Try Pandas or API method, search for "translator", "linux", or "writer"

    2. **⚖️ Caselaw**: Try API method only, search for "contract", "court", or "appeal"

    3. **💰 Finance**: Try Pandas or API method (requires auth), search for "interest" or "market"

    4. **🩺 Medical**: Try Pandas method only (requires auth), search for "diagnosis" or "treatment"

    5. **🖼️ InScene**: Try "🖼️ Datasets with Images" to see actual images, search for "kitchen" or "outdoor"

    ### 🔑 Authentication:

    For gated datasets (Finance, Medical, InScene), run: `huggingface-cli login`

    ### 🛠️ Method Explanations:

    - **📨 API**: Fast, reliable, works without login (100 rows max)

    - **🐼 Pandas**: Full dataset access, requires login for gated datasets

    - **🤗 Datasets**: Standard HuggingFace datasets library

    - **🖼️ Datasets with Images**: Special image viewer for InScene dataset

    ### ⚠️ Note:
|
Only working methods are shown for each dataset. Non-functional methods have been removed. |
|
""") |
|
|
|
if not DATASETS_AVAILABLE: |
|
gr.Markdown("**β οΈ Install datasets library for image viewing:** `pip install datasets`") |
|
|
|
with gr.Tabs(): |
|
for key in DATASET_CONFIG.keys(): |
|
create_dataset_tab(key) |
|
|
|
if __name__ == "__main__": |
|
demo.launch(debug=True) |