# app.py
import gradio as gr
import pandas as pd
import requests
import io
import warnings
import traceback
import json
import tempfile
import os
import logging
# 🀫 Suppress warnings and set logging levels
warnings.filterwarnings("ignore")
logging.getLogger("absl").setLevel(logging.ERROR) # Suppress MLCroissant warnings
os.environ["ABSL_LOG_LEVEL"] = "2" # Only show errors
# Import optional dependencies with fallbacks
try:
import dask.dataframe as dd
DASK_AVAILABLE = True
except ImportError:
DASK_AVAILABLE = False
try:
from datasets import load_dataset, Image
DATASETS_AVAILABLE = True
except ImportError:
DATASETS_AVAILABLE = False
try:
from mlcroissant import Dataset as CroissantDataset
CROISSANT_AVAILABLE = True
except ImportError:
CROISSANT_AVAILABLE = False
try:
from huggingface_hub import get_token
HF_HUB_AVAILABLE = True
except ImportError:
HF_HUB_AVAILABLE = False
try:
import polars as pl
POLARS_AVAILABLE = True
except ImportError:
POLARS_AVAILABLE = False
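# NOTE: Of these flags, only DATASETS_AVAILABLE and HF_HUB_AVAILABLE are read
# below; the Dask, MLCroissant, and Polars probes are not yet wired to any
# access method.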
# --- βš™οΈ Configuration & Constants ---
DATASET_CONFIG = {
"caselaw": {
"name": "common-pile/caselaw_access_project", "emoji": "βš–οΈ",
"methods": ["πŸ’¨ API (requests)"], "is_public": True,
},
"prompts": {
"name": "fka/awesome-chatgpt-prompts", "emoji": "πŸ€–",
"methods": ["🐼 Pandas", "πŸ’¨ API (requests)"], "is_public": True,
},
"finance": {
"name": "snorkelai/agent-finance-reasoning", "emoji": "πŸ’°",
"methods": ["🐼 Pandas", "πŸ’¨ API (requests)"], "is_public": False,
},
"medical": {
"name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
"methods": ["🐼 Pandas"], "is_public": False,
},
"inscene": {
"name": "peteromallet/InScene-Dataset", "emoji": "πŸ–ΌοΈ",
"methods": ["πŸ€— Datasets", "πŸ–ΌοΈ Datasets with Images"], "is_public": False,
},
}
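# To register another dataset, add an entry like this (hypothetical example;
# the repo id and methods below are placeholders, not a real configuration):
# "mydata": {
#     "name": "your-user/your-dataset", "emoji": "πŸ“¦",
#     "methods": ["🐼 Pandas", "πŸ’¨ API (requests)"], "is_public": True,
# },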
# --- πŸ”§ Helpers & Utility Functions ---
def get_auth_headers():
"""πŸ”‘ Get authentication headers if available"""
if not HF_HUB_AVAILABLE:
return {}
try:
token = get_token()
return {"Authorization": f"Bearer {token}"} if token else {}
except Exception:
return {}
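# get_token() resolves credentials from the HF_TOKEN environment variable or the
# token cached by `huggingface-cli login`, so no secret is hard-coded here.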
# --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
def dataframe_to_outputs(df: pd.DataFrame):
"""
πŸ“œ Takes a DataFrame and transforms it into various formats.
Now uses temporary files for maximum Gradio compatibility.
"""
if df.empty:
return "No results found. 🀷", None, None, "No results to copy."
df_str = df.astype(str)
markdown_output = df_str.to_markdown(index=False)
# Create a temporary CSV file
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv:
df.to_csv(tmp_csv.name, index=False)
csv_path = tmp_csv.name
# Create a temporary XLSX file
with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx:
df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl')
xlsx_path = tmp_xlsx.name
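# NOTE: delete=False leaves both files on disk so Gradio's File components can
# serve them for download; nothing here removes them, so cleanup is left to the
# OS temp directory.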
tab_delimited_output = df.to_csv(sep='\t', index=False)
return (
markdown_output,
csv_path,
xlsx_path,
tab_delimited_output,
)
def handle_error(e: Exception, request=None, response=None):
"""
😱 Oh no! An error! This function now creates a detailed debug log.
"""
error_message = f"🚨 An error occurred: {str(e)}\n"
auth_tip = "πŸ”‘ For gated datasets, did you log in? Try `huggingface-cli login` in your terminal."
full_trace = traceback.format_exc()
print(full_trace)
if "401" in str(e) or "Gated" in str(e):
error_message += auth_tip
debug_log = f"""--- 🐞 DEBUG LOG ---\nTraceback:\n{full_trace}\n\nException Type: {type(e).__name__}\nException Details: {e}\n"""
if request:
debug_log += f"""\n--- REQUEST ---\nMethod: {request.method}\nURL: {request.url}\nHeaders: {json.dumps(dict(request.headers), indent=2)}\n"""
if response is not None:
try:
response_text = json.dumps(response.json(), indent=2)
except json.JSONDecodeError:
response_text = response.text
debug_log += f"""\n--- RESPONSE ---\nStatus Code: {response.status_code}\nHeaders: {json.dumps(dict(response.headers), indent=2)}\nContent:\n{response_text}\n"""
return (
pd.DataFrame(), gr.Gallery(None), "### 🚨 Error\nAn error occurred. See the debug log below for details.",
"", None, None, "", f"```python\n# 🚨 Error during execution:\n# {e}\n```",
gr.Code(value=debug_log, visible=True)
)
def search_dataframe(df: pd.DataFrame, query: str):
"""πŸ” Case-insensitive substring search across all string columns; returns the first 100 rows when the query is empty."""
if not query:
return df.head(100)
string_cols = df.select_dtypes(include=['object', 'string']).columns
if string_cols.empty:
return pd.DataFrame()
# Align the mask to df's index so filtering also works on non-default indexes
mask = pd.Series(False, index=df.index)
for col in string_cols:
# regex=False treats the query as a literal string, so characters like "(" don't raise
mask |= df[col].astype(str).str.contains(query, case=False, na=False, regex=False)
return df[mask]
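# Example: search_dataframe(pd.DataFrame({"text": ["cat", "dog"]}), "CAT")
# keeps only the "cat" row, since matching is case-insensitive.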
def generate_code_snippet(dataset_key: str, access_method: str, query: str):
"""
πŸ’» Generate Python code snippet for the current operation
"""
config = DATASET_CONFIG[dataset_key]
repo_id = config["name"]
if "API" in access_method:
return f'''# 🌐 API Access for {repo_id}
import requests
import pandas as pd
url = "https://datasets-server.huggingface.co/rows"
params = {{
"dataset": "{repo_id}",
"config": "default",
"split": "train",
"offset": 0,
"length": 100
}}
headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}} if needed else {{}}
response = requests.get(url, params=params, headers=headers)
if response.status_code == 200:
data = response.json()
rows_data = [item['row'] for item in data['rows']]
df = pd.json_normalize(rows_data)
# Search for: "{query}"
if "{query}":
string_cols = df.select_dtypes(include=['object', 'string']).columns
mask = pd.Series([False] * len(df))
for col in string_cols:
mask |= df[col].astype(str).str.contains("{query}", case=False, na=False, regex=False)
df = df[mask]
print(f"Found {{len(df)}} results")
print(df.head())
else:
print(f"Error: {{response.status_code}} - {{response.text}}")
'''
elif "Pandas" in access_method:
file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet"
read_function = "read_csv" if "csv" in file_path else "read_parquet"
return f'''# 🐼 Pandas Access for {repo_id}
import pandas as pd
# You may need: huggingface-cli login
df = pd.{read_function}("hf://datasets/{repo_id}/{file_path}")
# Search for: "{query}"
if "{query}":
string_cols = df.select_dtypes(include=['object', 'string']).columns
mask = pd.Series([False] * len(df))
for col in string_cols:
mask |= df[col].astype(str).str.contains("{query}", case=False, na=False, regex=False)
df = df[mask]
print(f"Found {{len(df)}} results")
print(df.head())
'''
elif "Datasets" in access_method:
if "Images" in access_method:
return f'''# πŸ–ΌοΈ Datasets Library with Image Access for {repo_id}
from datasets import load_dataset
import pandas as pd
# You may need: huggingface-cli login
ds = load_dataset("{repo_id}", split="train", streaming=True)
data = list(ds.take(50)) # Smaller sample for images
df = pd.DataFrame(data)
# Process images
images = []
for item in data:
if 'image' in item and item['image'] is not None:
images.append((item['image'], item.get('text', '')))
print(f"Found {{len(df)}} records with {{len(images)}} images")
print(df.head())
# Display first image
if images:
first_image, caption = images[0]
first_image.show() # If PIL Image
print(f"Caption: {{caption}}")
'''
else:
return f'''# πŸ€— Datasets Library Access for {repo_id}
from datasets import load_dataset
import pandas as pd
# You may need: huggingface-cli login
ds = load_dataset("{repo_id}", split="train", streaming=True)
data = list(ds.take(1000))
df = pd.DataFrame(data)
# Search for: "{query}"
if "{query}":
string_cols = df.select_dtypes(include=['object', 'string']).columns
mask = pd.Series([False] * len(df))
for col in string_cols:
mask |= df[col].astype(str).str.contains("{query}", case=False, na=False, regex=False)
df = df[mask]
print(f"Found {{len(df)}} results")
print(df.head())
'''
else:
return f"# Code generation for {access_method} not implemented yet"
# --- 🎣 Data Fetching & Processing Functions ---
def fetch_data(dataset_key: str, access_method: str, query: str):
"""
πŸš€ Main mission control. Always yields a tuple of 9 values to match the UI components.
"""
outputs = [pd.DataFrame(), None, "🏁 Ready.", "", None, None, "", "", gr.Code(visible=False)]
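# Output slots, in the same order as the UI components wired up in
# create_dataset_tab: [0] results DataFrame, [1] image gallery, [2] status
# markdown, [3] markdown view, [4] CSV file, [5] XLSX file, [6] tab-delimited
# text, [7] code snippet, [8] debug log.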
req, res = None, None
try:
config = DATASET_CONFIG[dataset_key]
repo_id = config["name"]
# Generate code snippet
code_snippet = generate_code_snippet(dataset_key, access_method, query)
outputs[7] = code_snippet
if "API" in access_method:
all_results_df = pd.DataFrame()
MAX_PAGES = 5
PAGE_SIZE = 100
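# The datasets-server /rows endpoint caps `length` at 100 rows per request, so
# a search scans at most MAX_PAGES * PAGE_SIZE = 500 rows.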
if not query:
MAX_PAGES = 1
outputs[2] = "⏳ No search term. Fetching first 100 records as a sample..."
yield tuple(outputs)
for page in range(MAX_PAGES):
if query:
outputs[2] = f"⏳ Searching page {page + 1}..."
yield tuple(outputs)
offset = page * PAGE_SIZE
url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}"
headers = get_auth_headers() if not config["is_public"] else {}
res = requests.get(url, headers=headers, timeout=30)  # timeout guards against a hung request
req = res.request
res.raise_for_status()
data = res.json()
if not data.get('rows'):
outputs[2] = "🏁 No more data to search."
yield tuple(outputs)
break
# --- ✨ FIXED: JSON processing logic ---
# Extract the actual data from the 'row' key of each item in the list
rows_data = [item['row'] for item in data['rows']]
page_df = pd.json_normalize(rows_data)
found_in_page = search_dataframe(page_df, query)
if not found_in_page.empty:
all_results_df = pd.concat([all_results_df, found_in_page]).reset_index(drop=True)
outputs[0] = all_results_df
outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
outputs[2] = f"βœ… Found **{len(all_results_df)}** results so far..."
if dataset_key == 'inscene':
try:
gallery_data = []
for _, row in all_results_df.iterrows():
if 'image' in row:
image_data = row.get('image')
text_data = row.get('text', '')
# Handle different image formats safely
if hasattr(image_data, 'save'): # PIL Image
gallery_data.append((image_data, text_data))
elif isinstance(image_data, str): # Image path or URL
gallery_data.append((image_data, text_data))
if gallery_data:
outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)
except Exception:
# Don't break the search flow for image display errors
pass
yield tuple(outputs)
outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
yield tuple(outputs)
return
outputs[2] = f"⏳ Loading data via `{access_method}`..."
yield tuple(outputs)
df = pd.DataFrame()
if "Pandas" in access_method:
file_path = f"hf://datasets/{repo_id}/"
if repo_id == "fka/awesome-chatgpt-prompts":
file_path += "prompts.csv"
df = pd.read_csv(file_path)
else:
# Try the common parquet layouts first, then fall back to the dataset's JSON file
try:
df = pd.read_parquet(f"{file_path}data/train-00000-of-00001.parquet")
except Exception:
try:
df = pd.read_parquet(f"{file_path}train.parquet")
except Exception:
df = pd.read_json(f"{file_path}medical_o1_sft.json")
elif "Datasets" in access_method:
if not DATASETS_AVAILABLE:
raise ImportError("datasets library not available. Install with: pip install datasets")
# Special handling for image datasets
if dataset_key == 'inscene' and "Images" in access_method:
outputs[2] = "πŸ–ΌοΈ Loading InScene dataset with image processing..."
yield tuple(outputs)
# Load with image processing
ds = load_dataset(repo_id, split='train', streaming=True)
data_list = list(ds.take(50)) # Smaller sample for images
df = pd.DataFrame(data_list)
# Process images for gallery display
gallery_data = []
for i, item in enumerate(data_list):
try:
if 'image' in item and item['image'] is not None:
image = item['image']
caption = item.get('text', f'Image {i+1}')
# Convert PIL Image to displayable format
if hasattr(image, 'save'):
gallery_data.append((image, caption))
elif isinstance(image, str):
gallery_data.append((image, caption))
# Limit to first 20 images for performance
if len(gallery_data) >= 20:
break
except Exception:
continue  # skip records whose image can't be processed
# Update gallery with images
if gallery_data:
outputs[1] = gr.Gallery(gallery_data, label=f"πŸ–ΌοΈ Found {len(gallery_data)} Images", height=400, columns=4, rows=2)
outputs[2] = f"πŸ–ΌοΈ Loaded {len(df)} records with {len(gallery_data)} images"
else:
outputs[2] = "πŸ–ΌοΈ Loaded data but no images found to display"
else:
# Regular datasets loading
ds = load_dataset(repo_id, split='train', streaming=True)
data_list = list(ds.take(1000))
df = pd.DataFrame(data_list)
outputs[2] = f"πŸ“š Loaded {len(df)} records via Datasets library"
outputs[2] = "πŸ” Searching loaded data..."
yield tuple(outputs)
final_df = search_dataframe(df, query)
outputs[0] = final_df
outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."
if dataset_key == 'inscene' and not final_df.empty:
# Handle image data more safely
try:
gallery_data = []
for _, row in final_df.iterrows():
if 'image' in row:
image_data = row.get('image')
text_data = row.get('text', '')
# Handle different image formats
if hasattr(image_data, 'save'): # PIL Image
gallery_data.append((image_data, text_data))
elif isinstance(image_data, str): # Image path or URL
gallery_data.append((image_data, text_data))
if gallery_data:
outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)
except Exception as img_error:
outputs[2] += f"\n⚠️ Image display error: {str(img_error)}"
yield tuple(outputs)
except Exception as e:
yield handle_error(e, req, res)
# --- πŸ–ΌοΈ UI Generation ---
def create_dataset_tab(dataset_key: str):
config = DATASET_CONFIG[dataset_key]
with gr.Tab(f"{config['emoji']} {dataset_key.capitalize()}"):
gr.Markdown(f"## {config['emoji']} Query the `{config['name']}` Dataset")
if not config['is_public']:
gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")
# Show available methods for this dataset
available_methods = config['methods']
methods_note = f"**Available methods:** {len(available_methods)} (only methods verified to work with this dataset are listed)"
if dataset_key == 'inscene':
methods_note += " (πŸ–ΌοΈ = Image viewer included)"
gr.Markdown(methods_note)
with gr.Row():
access_method = gr.Radio(
available_methods,
label="πŸ”‘ Access Method",
value=available_methods[0] if available_methods else "πŸ’¨ API (requests)"
)
query = gr.Textbox(
label="πŸ” Search Query",
placeholder="Enter any text to search, or leave blank for samples..."
)
fetch_button = gr.Button("πŸš€ Go Fetch!")
status_output = gr.Markdown("🏁 Ready to search.")
df_output = gr.DataFrame(label="πŸ“Š Results DataFrame", interactive=False, wrap=True)
# Show gallery for InScene dataset or when using image methods
show_gallery = (dataset_key == 'inscene')
gallery_output = gr.Gallery(visible=show_gallery, label="πŸ–ΌοΈ Image Results", height=400, columns=4, rows=2)
with gr.Accordion("πŸ“‚ View/Export Full Results", open=False):
markdown_output = gr.Markdown(label="πŸ“ Markdown View")
with gr.Row():
csv_output = gr.File(label="⬇️ Download CSV")
xlsx_output = gr.File(label="⬇️ Download XLSX")
copy_output = gr.Code(label="πŸ“‹ Copy-Paste (Tab-Delimited)")
code_output = gr.Code(label="πŸ’» Python Code Snippet", language="python")
debug_log_output = gr.Code(label="🐞 Debug Log", visible=False)
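# gr.State(dataset_key) passes this tab's dataset key as a hidden input, so all
# tabs can share the single fetch_data handler.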
fetch_button.click(
fn=fetch_data,
inputs=[gr.State(dataset_key), access_method, query],
outputs=[
df_output, gallery_output, status_output, markdown_output,
csv_output, xlsx_output, copy_output, code_output,
debug_log_output
]
)
# --- πŸš€ Main App ---
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
gr.Markdown("# πŸ€— Hugging Face Dataset Explorer")
gr.Markdown(
"Select a dataset, choose an access method, and type a query. "
"If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
)
# Show dependency status and dataset-specific methods
def get_dependency_status():
status = "### πŸ”§ Dataset-Specific Methods (Only Working Methods Shown):\n"
for key, config in DATASET_CONFIG.items():
methods_str = ", ".join(config['methods'])
auth_status = "πŸ” Requires Auth" if not config['is_public'] else "βœ… Public"
status += f"- **{config['emoji']} {key.capitalize()}**: {methods_str} ({auth_status})\n"
status += "\n### πŸ“š Library Dependencies:\n"
status += f"- **🐼 Pandas**: βœ… Available\n"
status += f"- **πŸ’¨ Requests**: βœ… Available\n"
status += f"- **πŸ€— Datasets**: {'βœ… Available' if DATASETS_AVAILABLE else '❌ Not installed'}\n"
return status
with gr.Accordion("πŸ”§ Library Status & Quick Start Guide", open=False):
gr.Markdown(get_dependency_status())
gr.Markdown("""
### πŸš€ Quick Start Guide:
1. **πŸ€– Prompts**: Try Pandas or API method, search for "translator", "linux", or "writer"
2. **βš–οΈ Caselaw**: Try API method only, search for "contract", "court", or "appeal"
3. **πŸ’° Finance**: Try Pandas or API method (requires auth), search for "interest" or "market"
4. **🩺 Medical**: Try Pandas method only (requires auth), search for "diagnosis" or "treatment"
5. **πŸ–ΌοΈ InScene**: Try "πŸ–ΌοΈ Datasets with Images" to see actual images, search for "kitchen" or "outdoor"
### πŸ”‘ Authentication:
For gated datasets (Finance, Medical, InScene), run: `huggingface-cli login`
### πŸ› οΈ Method Explanations:
- **πŸ’¨ API**: Fast, reliable, works without login for public datasets (100 rows per request, up to 5 pages per search)
- **🐼 Pandas**: Full dataset access, requires login for gated datasets
- **πŸ€— Datasets**: Standard HuggingFace datasets library
- **πŸ–ΌοΈ Datasets with Images**: Special image viewer for InScene dataset
### ⚠️ Note:
Only working methods are shown for each dataset. Non-functional methods have been removed.
""")
if not DATASETS_AVAILABLE:
gr.Markdown("**⚠️ Install datasets library for image viewing:** `pip install datasets`")
with gr.Tabs():
for key in DATASET_CONFIG.keys():
create_dataset_tab(key)
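# Minimal smoke test outside the UI (hypothetical usage): fetch_data is a
# generator yielding the 9-tuple of UI values on every update, e.g.
#   for update in fetch_data("prompts", "🐼 Pandas", "translator"):
#       print(update[2])  # the status markdown string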
if __name__ == "__main__":
demo.launch(debug=True)