# app.py
import gradio as gr
import pandas as pd
import requests
import io
import warnings
import traceback
import json
import tempfile
import os
import logging

# 🤫 Suppress warnings and set logging levels
warnings.filterwarnings("ignore")
logging.getLogger("absl").setLevel(logging.ERROR)  # Suppress MLCroissant warnings
os.environ["ABSL_LOG_LEVEL"] = "2"  # Only show errors

# Import optional dependencies with fallbacks
try:
    import dask.dataframe as dd
    DASK_AVAILABLE = True
except ImportError:
    DASK_AVAILABLE = False

try:
    from datasets import load_dataset, Image
    DATASETS_AVAILABLE = True
except ImportError:
    DATASETS_AVAILABLE = False

try:
    from mlcroissant import Dataset as CroissantDataset
    CROISSANT_AVAILABLE = True
except ImportError:
    CROISSANT_AVAILABLE = False

try:
    from huggingface_hub import get_token
    HF_HUB_AVAILABLE = True
except ImportError:
    HF_HUB_AVAILABLE = False

try:
    import polars as pl
    POLARS_AVAILABLE = True
except ImportError:
    POLARS_AVAILABLE = False

# --- āš™ļø Configuration & Constants ---
DATASET_CONFIG = {
    "caselaw": {
        "name": "common-pile/caselaw_access_project",
        "emoji": "āš–ļø",
        "methods": ["šŸ’Ø API (requests)"],
        "is_public": True,
    },
    "prompts": {
        "name": "fka/awesome-chatgpt-prompts",
        "emoji": "šŸ¤–",
        "methods": ["🐼 Pandas", "šŸ’Ø API (requests)"],
        "is_public": True,
    },
    "finance": {
        "name": "snorkelai/agent-finance-reasoning",
        "emoji": "šŸ’°",
        "methods": ["🐼 Pandas", "šŸ’Ø API (requests)"],
        "is_public": False,
    },
    "medical": {
        "name": "FreedomIntelligence/medical-o1-reasoning-SFT",
        "emoji": "🩺",
        "methods": ["🐼 Pandas"],
        "is_public": False,
    },
    "inscene": {
        "name": "peteromallet/InScene-Dataset",
        "emoji": "šŸ–¼ļø",
        "methods": ["šŸ¤— Datasets", "šŸ–¼ļø Datasets with Images"],
        "is_public": False,
    },
}

# --- šŸ”§ Helpers & Utility Functions ---
def get_auth_headers():
    """šŸ”‘ Get authentication headers if available"""
    if not HF_HUB_AVAILABLE:
        return {}
    try:
        token = get_token()
        return {"Authorization": f"Bearer {token}"} if token else {}
    except Exception:
        return {}

# --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
def dataframe_to_outputs(df: pd.DataFrame):
    """
    šŸ“œ Takes a DataFrame and transforms it into various formats.
    Now uses temporary files for maximum Gradio compatibility.
    """
    if df.empty:
        return "No results found. 🤷", None, None, "No results to copy."

    df_str = df.astype(str)
    markdown_output = df_str.to_markdown(index=False)

    # Create a temporary CSV file
    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv:
        df.to_csv(tmp_csv.name, index=False)
        csv_path = tmp_csv.name

    # Create a temporary XLSX file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx:
        df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl')
        xlsx_path = tmp_xlsx.name

    tab_delimited_output = df.to_csv(sep='\t', index=False)

    return (
        markdown_output,
        csv_path,
        xlsx_path,
        tab_delimited_output,
    )

def handle_error(e: Exception, request=None, response=None):
    """
    😱 Oh no! An error! This function now creates a detailed debug log.
    """
    error_message = f"🚨 An error occurred: {str(e)}\n"
    auth_tip = "šŸ”‘ For gated datasets, did you log in? Try `huggingface-cli login` in your terminal."
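    # Assemble the debug log piecewise: traceback first, then request and
    # response details whenever the caller was able to supply them.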
    full_trace = traceback.format_exc()
    print(full_trace)
    if "401" in str(e) or "Gated" in str(e):
        error_message += auth_tip

    debug_log = f"""--- šŸž DEBUG LOG ---
Traceback:
{full_trace}

Exception Type: {type(e).__name__}
Exception Details: {e}
"""
    if request:
        debug_log += f"""
--- REQUEST ---
Method: {request.method}
URL: {request.url}
Headers: {json.dumps(dict(request.headers), indent=2)}
"""
    if response is not None:
        try:
            response_text = json.dumps(response.json(), indent=2)
        except ValueError:  # covers json.JSONDecodeError and requests' subclass of it
            response_text = response.text
        debug_log += f"""
--- RESPONSE ---
Status Code: {response.status_code}
Headers: {json.dumps(dict(response.headers), indent=2)}
Content:
{response_text}
"""

    return (
        pd.DataFrame(),
        gr.Gallery(None),
        "### 🚨 Error\nAn error occurred. See the debug log below for details.",
        "",
        None,
        None,
        "",
        f"```python\n# 🚨 Error during execution:\n# {e}\n```",
        gr.Code(value=debug_log, visible=True),
    )

def search_dataframe(df: pd.DataFrame, query: str):
    """šŸ”Ž Case-insensitive substring search across all string columns."""
    if not query:
        return df.head(100)
    string_cols = df.select_dtypes(include=['object', 'string']).columns
    if string_cols.empty:
        return pd.DataFrame()
    mask = pd.Series(False, index=df.index)  # align with df's index, not a fresh RangeIndex
    for col in string_cols:
        mask |= df[col].astype(str).str.contains(query, case=False, na=False)
    return df[mask]

def generate_code_snippet(dataset_key: str, access_method: str, query: str):
    """
    šŸ’» Generate Python code snippet for the current operation
    """
    config = DATASET_CONFIG[dataset_key]
    repo_id = config["name"]

    if "API" in access_method:
        return f'''# 🌐 API Access for {repo_id}
import requests
import pandas as pd

url = "https://datasets-server.huggingface.co/rows"
params = {{
    "dataset": "{repo_id}",
    "config": "default",
    "split": "train",
    "offset": 0,
    "length": 100
}}
headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}}  # use {{}} instead for public datasets

response = requests.get(url, params=params, headers=headers)
if response.status_code == 200:
    data = response.json()
    rows_data = [item['row'] for item in data['rows']]
    df = pd.json_normalize(rows_data)

    # Search for: "{query}"
    if "{query}":
        string_cols = df.select_dtypes(include=['object', 'string']).columns
        mask = pd.Series([False] * len(df))
        for col in string_cols:
            mask |= df[col].astype(str).str.contains("{query}", case=False, na=False)
        df = df[mask]

    print(f"Found {{len(df)}} results")
    print(df.head())
else:
    print(f"Error: {{response.status_code}} - {{response.text}}")
'''

    elif "Pandas" in access_method:
        file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet"
        read_function = "read_csv" if "csv" in file_path else "read_parquet"
        return f'''# 🐼 Pandas Access for {repo_id}
import pandas as pd

# You may need: huggingface-cli login
df = pd.{read_function}("hf://datasets/{repo_id}/{file_path}")

# Search for: "{query}"
if "{query}":
    string_cols = df.select_dtypes(include=['object', 'string']).columns
    mask = pd.Series([False] * len(df))
    for col in string_cols:
        mask |= df[col].astype(str).str.contains("{query}", case=False, na=False)
    df = df[mask]

print(f"Found {{len(df)}} results")
print(df.head())
'''

    elif "Datasets" in access_method:
        if "Images" in access_method:
            return f'''# šŸ–¼ļø Datasets Library with Image Access for {repo_id}
from datasets import load_dataset
import pandas as pd

# You may need: huggingface-cli login
ds = load_dataset("{repo_id}", split="train", streaming=True)
data = list(ds.take(50))  # Smaller sample for images
df = pd.DataFrame(data)

# Process images
images = []
for item in data:
    if 'image' in item and item['image'] is not None:
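        # item['image'] is typically a decoded PIL.Image when the dataset uses the Image feature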
        images.append((item['image'], item.get('text', '')))

print(f"Found {{len(df)}} records with {{len(images)}} images")
print(df.head())

# Display first image
if images:
    first_image, caption = images[0]
    first_image.show()  # If PIL Image
    print(f"Caption: {{caption}}")
'''
        else:
            return f'''# šŸ¤— Datasets Library Access for {repo_id}
from datasets import load_dataset
import pandas as pd

# You may need: huggingface-cli login
ds = load_dataset("{repo_id}", split="train", streaming=True)
data = list(ds.take(1000))
df = pd.DataFrame(data)

# Search for: "{query}"
if "{query}":
    string_cols = df.select_dtypes(include=['object', 'string']).columns
    mask = pd.Series([False] * len(df))
    for col in string_cols:
        mask |= df[col].astype(str).str.contains("{query}", case=False, na=False)
    df = df[mask]

print(f"Found {{len(df)}} results")
print(df.head())
'''

    else:
        return f"# Code generation for {access_method} not implemented yet"

# --- šŸŽ£ Data Fetching & Processing Functions ---
def fetch_data(dataset_key: str, access_method: str, query: str):
    """
    šŸš€ Main mission control. Always yields a tuple of 9 values to match the UI components.
    """
    outputs = [pd.DataFrame(), None, "šŸ Ready.", "", None, None, "", "", gr.Code(visible=False)]
    req, res = None, None
    try:
        config = DATASET_CONFIG[dataset_key]
        repo_id = config["name"]

        # Generate code snippet
        code_snippet = generate_code_snippet(dataset_key, access_method, query)
        outputs[7] = code_snippet

        if "API" in access_method:
            all_results_df = pd.DataFrame()
            MAX_PAGES = 5
            PAGE_SIZE = 100

            if not query:
                MAX_PAGES = 1
                outputs[2] = "ā³ No search term. Fetching first 100 records as a sample..."
                yield tuple(outputs)

            for page in range(MAX_PAGES):
                if query:
                    outputs[2] = f"ā³ Searching page {page + 1}..."
                    yield tuple(outputs)

                offset = page * PAGE_SIZE
                url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}"
                headers = get_auth_headers() if not config["is_public"] else {}
                res = requests.get(url, headers=headers)
                req = res.request
                res.raise_for_status()
                data = res.json()

                if not data.get('rows'):
                    outputs[2] = "šŸ No more data to search."
                    yield tuple(outputs)
                    break

                # --- ✨ FIXED: JSON processing logic ---
                # Extract the actual data from the 'row' key of each item in the list
                rows_data = [item['row'] for item in data['rows']]
                page_df = pd.json_normalize(rows_data)

                found_in_page = search_dataframe(page_df, query)

                if not found_in_page.empty:
                    all_results_df = pd.concat([all_results_df, found_in_page]).reset_index(drop=True)
                    outputs[0] = all_results_df
                    outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
                    outputs[2] = f"āœ… Found **{len(all_results_df)}** results so far..."

                    if dataset_key == 'inscene':
                        try:
                            gallery_data = []
                            for _, row in all_results_df.iterrows():
                                if 'image' in row:
                                    image_data = row.get('image')
                                    text_data = row.get('text', '')
                                    # Handle different image formats safely
                                    if hasattr(image_data, 'save'):  # PIL Image
                                        gallery_data.append((image_data, text_data))
                                    elif isinstance(image_data, str):  # Image path or URL
                                        gallery_data.append((image_data, text_data))
                            if gallery_data:
                                outputs[1] = gr.Gallery(gallery_data, label="šŸ–¼ļø Image Results", height=400)
                        except Exception:
                            # Don't break the flow for image errors
                            pass

                    yield tuple(outputs)

            outputs[2] = f"šŸ Search complete. Found a total of **{len(all_results_df)}** results."
            yield tuple(outputs)
            return

        outputs[2] = f"ā³ Loading data via `{access_method}`..."
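        # Non-API paths below: Pandas reads files straight from the Hub, while the
        # Datasets library streams a sample; results are then filtered locally.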
        yield tuple(outputs)
        df = pd.DataFrame()

        if "Pandas" in access_method:
            file_path = f"hf://datasets/{repo_id}/"
            if repo_id == "fka/awesome-chatgpt-prompts":
                file_path += "prompts.csv"
                df = pd.read_csv(file_path)
            else:
                try:
                    df = pd.read_parquet(f"{file_path}data/train-00000-of-00001.parquet")
                except Exception:
                    try:
                        df = pd.read_parquet(f"{file_path}train.parquet")
                    except Exception:
                        df = pd.read_json(f"{file_path}medical_o1_sft.json")

        elif "Datasets" in access_method:
            if not DATASETS_AVAILABLE:
                raise ImportError("datasets library not available. Install with: pip install datasets")

            # Special handling for image datasets
            if dataset_key == 'inscene' and "Images" in access_method:
                outputs[2] = "šŸ–¼ļø Loading InScene dataset with image processing..."
                yield tuple(outputs)

                # Load with image processing
                ds = load_dataset(repo_id, split='train', streaming=True)
                data_list = list(ds.take(50))  # Smaller sample for images
                df = pd.DataFrame(data_list)

                # Process images for gallery display
                gallery_data = []
                for i, item in enumerate(data_list):
                    try:
                        if 'image' in item and item['image'] is not None:
                            image = item['image']
                            caption = item.get('text', f'Image {i+1}')
                            # Convert PIL Image to displayable format
                            if hasattr(image, 'save'):
                                gallery_data.append((image, caption))
                            elif isinstance(image, str):
                                gallery_data.append((image, caption))
                            # Limit to first 20 images for performance
                            if len(gallery_data) >= 20:
                                break
                    except Exception:
                        continue

                # Update gallery with images
                if gallery_data:
                    outputs[1] = gr.Gallery(gallery_data, label=f"šŸ–¼ļø Found {len(gallery_data)} Images", height=400, columns=4, rows=2)
                    outputs[2] = f"šŸ–¼ļø Loaded {len(df)} records with {len(gallery_data)} images"
                else:
                    outputs[2] = "šŸ–¼ļø Loaded data but no images found to display"
            else:
                # Regular datasets loading
                ds = load_dataset(repo_id, split='train', streaming=True)
                data_list = list(ds.take(1000))
                df = pd.DataFrame(data_list)
                outputs[2] = f"šŸ“š Loaded {len(df)} records via Datasets library"

        outputs[2] = "šŸ” Searching loaded data..."
        yield tuple(outputs)

        final_df = search_dataframe(df, query)
        outputs[0] = final_df
        outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
        outputs[2] = f"šŸ Search complete. Found **{len(final_df)}** results."

        if dataset_key == 'inscene' and not final_df.empty:
            # Handle image data more safely
            try:
                gallery_data = []
                for _, row in final_df.iterrows():
                    if 'image' in row:
                        image_data = row.get('image')
                        text_data = row.get('text', '')
                        # Handle different image formats
                        if hasattr(image_data, 'save'):  # PIL Image
                            gallery_data.append((image_data, text_data))
                        elif isinstance(image_data, str):  # Image path or URL
                            gallery_data.append((image_data, text_data))
                if gallery_data:
                    outputs[1] = gr.Gallery(gallery_data, label="šŸ–¼ļø Image Results", height=400)
            except Exception as img_error:
                outputs[2] += f"\nāš ļø Image display error: {str(img_error)}"

        yield tuple(outputs)

    except Exception as e:
        yield handle_error(e, req, res)

# --- šŸ–¼ļø UI Generation ---
def create_dataset_tab(dataset_key: str):
    config = DATASET_CONFIG[dataset_key]
    with gr.Tab(f"{config['emoji']} {dataset_key.capitalize()}"):
        gr.Markdown(f"## {config['emoji']} Query the `{config['name']}` Dataset")
        if not config['is_public']:
            gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")
        # Show available methods for this dataset
        available_methods = config['methods']
        methods_note = f"**Available methods:** {len(available_methods)} tested and working"
        if dataset_key == 'inscene':
            methods_note += " (šŸ–¼ļø = Image viewer included)"
        gr.Markdown(methods_note)

        with gr.Row():
            access_method = gr.Radio(
                available_methods,
                label="šŸ”‘ Access Method",
                value=available_methods[0] if available_methods else "šŸ’Ø API (requests)"
            )
            query = gr.Textbox(
                label="šŸ” Search Query",
                placeholder="Enter any text to search, or leave blank for samples..."
            )

        fetch_button = gr.Button("šŸš€ Go Fetch!")
        status_output = gr.Markdown("šŸ Ready to search.")
        df_output = gr.DataFrame(label="šŸ“Š Results DataFrame", interactive=False, wrap=True)

        # Show gallery for InScene dataset or when using image methods
        show_gallery = (dataset_key == 'inscene')
        gallery_output = gr.Gallery(visible=show_gallery, label="šŸ–¼ļø Image Results", height=400, columns=4, rows=2)

        with gr.Accordion("šŸ“‚ View/Export Full Results", open=False):
            markdown_output = gr.Markdown(label="šŸ“ Markdown View")
            with gr.Row():
                csv_output = gr.File(label="ā¬‡ļø Download CSV")
                xlsx_output = gr.File(label="ā¬‡ļø Download XLSX")
            copy_output = gr.Code(label="šŸ“‹ Copy-Paste (Tab-Delimited)")

        code_output = gr.Code(label="šŸ’» Python Code Snippet", language="python")
        debug_log_output = gr.Code(label="šŸž Debug Log", visible=False)

        fetch_button.click(
            fn=fetch_data,
            inputs=[gr.State(dataset_key), access_method, query],
            outputs=[
                df_output, gallery_output, status_output,
                markdown_output, csv_output, xlsx_output,
                copy_output, code_output, debug_log_output
            ]
        )

# --- šŸš€ Main App ---
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
    gr.Markdown("# šŸ¤— Hugging Face Dataset Explorer")
    gr.Markdown(
        "Select a dataset, choose an access method, and type a query. "
        "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
    )

    # Show dependency status and dataset-specific methods
    def get_dependency_status():
        status = "### šŸ”§ Dataset-Specific Methods (Only Working Methods Shown):\n"
        for key, config in DATASET_CONFIG.items():
            methods_str = ", ".join(config['methods'])
            auth_status = "šŸ” Requires Auth" if not config['is_public'] else "āœ… Public"
            status += f"- **{config['emoji']} {key.capitalize()}**: {methods_str} ({auth_status})\n"
        status += "\n### šŸ“š Library Dependencies:\n"
        status += "- **🐼 Pandas**: āœ… Available\n"
        status += "- **šŸ’Ø Requests**: āœ… Available\n"
        status += f"- **šŸ¤— Datasets**: {'āœ… Available' if DATASETS_AVAILABLE else 'āŒ Not installed'}\n"
        return status

    with gr.Accordion("šŸ”§ Library Status & Quick Start Guide", open=False):
        gr.Markdown(get_dependency_status())
        gr.Markdown("""
### šŸš€ Quick Start Guide:
1. **šŸ¤– Prompts**: Try Pandas or API method, search for "translator", "linux", or "writer"
2. **āš–ļø Caselaw**: Try API method only, search for "contract", "court", or "appeal"
3. **šŸ’° Finance**: Try Pandas or API method (requires auth), search for "interest" or "market"
4. **🩺 Medical**: Try Pandas method only (requires auth), search for "diagnosis" or "treatment"
**šŸ–¼ļø InScene**: Try "šŸ–¼ļø Datasets with Images" to see actual images, search for "kitchen" or "outdoor" ### šŸ”‘ Authentication: For gated datasets (Finance, Medical, InScene), run: `huggingface-cli login` ### šŸ› ļø Method Explanations: - **šŸ’Ø API**: Fast, reliable, works without login (100 rows max) - **🐼 Pandas**: Full dataset access, requires login for gated datasets - **šŸ¤— Datasets**: Standard HuggingFace datasets library - **šŸ–¼ļø Datasets with Images**: Special image viewer for InScene dataset ### āš ļø Note: Only working methods are shown for each dataset. Non-functional methods have been removed. """) if not DATASETS_AVAILABLE: gr.Markdown("**āš ļø Install datasets library for image viewing:** `pip install datasets`") with gr.Tabs(): for key in DATASET_CONFIG.keys(): create_dataset_tab(key) if __name__ == "__main__": demo.launch(debug=True)