|
|
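"""Hugging Face Dataset Explorer.

A Gradio app that searches a small set of Hugging Face datasets via the
datasets-server API, pandas hf:// paths, or the `datasets` library, with
CSV/XLSX export and an image gallery for the InScene dataset.
"""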
|
import gradio as gr |
|
import pandas as pd |
|
import requests |
|
import io |
|
import warnings |
|
import traceback |
|
import json |
|
import tempfile |
|
import os |
|
import logging |
|
|
|
|
|
warnings.filterwarnings("ignore") |
|
logging.getLogger("absl").setLevel(logging.ERROR) |
|
os.environ["ABSL_LOG_LEVEL"] = "2" |
|
|
|
|
|
try: |
|
import dask.dataframe as dd |
|
DASK_AVAILABLE = True |
|
except ImportError: |
|
DASK_AVAILABLE = False |
|
|
|
try: |
|
from datasets import load_dataset, Image |
|
DATASETS_AVAILABLE = True |
|
except ImportError: |
|
DATASETS_AVAILABLE = False |
|
|
|
try: |
|
from mlcroissant import Dataset as CroissantDataset |
|
CROISSANT_AVAILABLE = True |
|
except ImportError: |
|
CROISSANT_AVAILABLE = False |
|
|
|
try: |
|
from huggingface_hub import get_token |
|
HF_HUB_AVAILABLE = True |
|
except ImportError: |
|
HF_HUB_AVAILABLE = False |
|
|
|
try: |
|
import polars as pl |
|
POLARS_AVAILABLE = True |
|
except ImportError: |
|
POLARS_AVAILABLE = False |
|
|
|
|
|
DATASET_CONFIG = {

    "caselaw": {

        "name": "common-pile/caselaw_access_project", "emoji": "⚖️",

        "methods": ["📨 API (requests)"], "is_public": True,

    },

    "prompts": {

        "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",

        "methods": ["🐼 Pandas", "📨 API (requests)"], "is_public": True,

    },

    "finance": {

        "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",

        "methods": ["🐼 Pandas", "📨 API (requests)"], "is_public": False,

    },

    "medical": {

        "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",

        "methods": ["🐼 Pandas"], "is_public": False,

    },

    "inscene": {

        "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",

        "methods": ["🤗 Datasets", "🖼️ Datasets with Images"], "is_public": False,

    },

}
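
# Note: these method labels are reused verbatim as the radio choices, and
# fetch_data / generate_code_snippet dispatch on the substrings "API",
# "Pandas", "Datasets", and "Images"; the emoji are cosmetic, the keywords
# are load-bearing.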
|
|
|
|
|
|
|
def get_auth_headers(): |
|
"""π Get authentication headers if available""" |
|
if not HF_HUB_AVAILABLE: |
|
return {} |
|
try: |
|
token = get_token() |
|
return {"Authorization": f"Bearer {token}"} if token else {} |
|
except Exception: |
|
return {} |
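
# Illustrative use: merge these headers into any datasets-server request, e.g.
#   requests.get("https://datasets-server.huggingface.co/rows", params=..., headers=get_auth_headers())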
|
|
|
|
|
def dataframe_to_outputs(df: pd.DataFrame): |
|
""" |
|
π Takes a DataFrame and transforms it into various formats. |
|
Now uses temporary files for maximum Gradio compatibility. |
|
""" |
|
if df.empty: |
|
return "No results found. π€·", None, None, "No results to copy." |
|
|
|
df_str = df.astype(str) |
|
markdown_output = df_str.to_markdown(index=False) |
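
    # Write exports to delete=False temp files so Gradio can serve them after
    # this function returns (they are not cleaned up here).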
|
|
|
|
|
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv: |
|
df.to_csv(tmp_csv.name, index=False) |
|
csv_path = tmp_csv.name |
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx: |
|
df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl') |
|
xlsx_path = tmp_xlsx.name |
|
|
|
tab_delimited_output = df.to_csv(sep='\t', index=False) |
|
|
|
return ( |
|
markdown_output, |
|
csv_path, |
|
xlsx_path, |
|
tab_delimited_output, |
|
) |
|
|
|
def handle_error(e: Exception, request=None, response=None): |
|
""" |
|
π± Oh no! An error! This function now creates a detailed debug log. |
|
""" |
|
error_message = f"π¨ An error occurred: {str(e)}\n" |
|
auth_tip = "π For gated datasets, did you log in? Try `huggingface-cli login` in your terminal." |
|
full_trace = traceback.format_exc() |
|
print(full_trace) |
|
if "401" in str(e) or "Gated" in str(e): |
|
error_message += auth_tip |
|
|
|
debug_log = f"""--- π DEBUG LOG ---\nTraceback:\n{full_trace}\n\nException Type: {type(e).__name__}\nException Details: {e}\n""" |
|
if request: |
|
debug_log += f"""\n--- REQUEST ---\nMethod: {request.method}\nURL: {request.url}\nHeaders: {json.dumps(dict(request.headers), indent=2)}\n""" |
|
if response is not None: |
|
try: |
|
response_text = json.dumps(response.json(), indent=2) |
|
except json.JSONDecodeError: |
|
response_text = response.text |
|
debug_log += f"""\n--- RESPONSE ---\nStatus Code: {response.status_code}\nHeaders: {json.dumps(dict(response.headers), indent=2)}\nContent:\n{response_text}\n""" |
|
|
|
return ( |
|
        pd.DataFrame(), gr.Gallery(None), "### 🚨 Error\nAn error occurred. See the debug log below for details.",

        "", None, None, "", f"```python\n# 🚨 Error during execution:\n# {e}\n```",
|
gr.Code(value=debug_log, visible=True) |
|
) |
|
|
|
def search_dataframe(df: pd.DataFrame, query: str): |
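    """🔍 Case-insensitive substring search over every string column.

    An empty query returns the first 100 rows as a sample; a frame with no
    string columns yields an empty result.
    """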
|
if not query: |
|
return df.head(100) |
|
string_cols = df.select_dtypes(include=['object', 'string']).columns |
|
if string_cols.empty: |
|
return pd.DataFrame() |
|
    mask = pd.Series(False, index=df.index)  # align the mask with df's index, not just 0..n-1
|
for col in string_cols: |
|
mask |= df[col].astype(str).str.contains(query, case=False, na=False) |
|
return df[mask] |
|
|
|
def generate_code_snippet(dataset_key: str, access_method: str, query: str): |
|
""" |
|
π» Generate Python code snippet for the current operation |
|
""" |
|
config = DATASET_CONFIG[dataset_key] |
|
repo_id = config["name"] |
|
|
|
if "API" in access_method: |
|
        return f'''# 🌐 API Access for {repo_id}
|
import requests |
|
import pandas as pd |
|
|
|
url = "https://datasets-server.huggingface.co/rows" |
|
params = {{ |
|
"dataset": "{repo_id}", |
|
"config": "default", |
|
"split": "train", |
|
"offset": 0, |
|
"length": 100 |
|
}} |
|
|
|
headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}} if needed else {{}} |
|
response = requests.get(url, params=params, headers=headers) |
|
|
|
if response.status_code == 200: |
|
data = response.json() |
|
rows_data = [item['row'] for item in data['rows']] |
|
df = pd.json_normalize(rows_data) |
|
|
|
# Search for: "{query}" |
|
if "{query}": |
|
string_cols = df.select_dtypes(include=['object', 'string']).columns |
|
mask = pd.Series([False] * len(df)) |
|
for col in string_cols: |
|
mask |= df[col].astype(str).str.contains("{query}", case=False, na=False) |
|
df = df[mask] |
|
|
|
print(f"Found {{len(df)}} results") |
|
print(df.head()) |
|
else: |
|
print(f"Error: {{response.status_code}} - {{response.text}}") |
|
''' |
|
|
|
elif "Pandas" in access_method: |
|
file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet" |
|
read_function = "read_csv" if "csv" in file_path else "read_parquet" |
|
|
|
        return f'''# 🐼 Pandas Access for {repo_id}
|
import pandas as pd |
|
|
|
# You may need: huggingface-cli login |
|
df = pd.{read_function}("hf://datasets/{repo_id}/{file_path}") |
|
|
|
# Search for: "{query}" |
|
if "{query}": |
|
string_cols = df.select_dtypes(include=['object', 'string']).columns |
|
mask = pd.Series([False] * len(df)) |
|
for col in string_cols: |
|
mask |= df[col].astype(str).str.contains("{query}", case=False, na=False) |
|
df = df[mask] |
|
|
|
print(f"Found {{len(df)}} results") |
|
print(df.head()) |
|
''' |
|
|
|
elif "Datasets" in access_method: |
|
if "Images" in access_method: |
|
            return f'''# 🖼️ Datasets Library with Image Access for {repo_id}
|
from datasets import load_dataset |
|
import pandas as pd |
|
|
|
# You may need: huggingface-cli login |
|
ds = load_dataset("{repo_id}", split="train", streaming=True) |
|
data = list(ds.take(50)) # Smaller sample for images |
|
df = pd.DataFrame(data) |
|
|
|
# Process images |
|
images = [] |
|
for item in data: |
|
if 'image' in item and item['image'] is not None: |
|
images.append((item['image'], item.get('text', ''))) |
|
|
|
print(f"Found {{len(df)}} records with {{len(images)}} images") |
|
print(df.head()) |
|
|
|
# Display first image |
|
if images: |
|
first_image, caption = images[0] |
|
first_image.show() # If PIL Image |
|
print(f"Caption: {{caption}}") |
|
''' |
|
else: |
|
            return f'''# 🤗 Datasets Library Access for {repo_id}
|
from datasets import load_dataset |
|
import pandas as pd |
|
|
|
# You may need: huggingface-cli login |
|
ds = load_dataset("{repo_id}", split="train", streaming=True) |
|
data = list(ds.take(1000)) |
|
df = pd.DataFrame(data) |
|
|
|
# Search for: "{query}" |
|
if "{query}": |
|
string_cols = df.select_dtypes(include=['object', 'string']).columns |
|
mask = pd.Series([False] * len(df)) |
|
for col in string_cols: |
|
mask |= df[col].astype(str).str.contains("{query}", case=False, na=False) |
|
df = df[mask] |
|
|
|
print(f"Found {{len(df)}} results") |
|
print(df.head()) |
|
''' |
|
|
|
else: |
|
return f"# Code generation for {access_method} not implemented yet" |
|
|
|
|
|
def fetch_data(dataset_key: str, access_method: str, query: str): |
|
""" |
|
π Main mission control. Always yields a tuple of 9 values to match the UI components. |
|
""" |
|
    outputs = [pd.DataFrame(), None, "👍 Ready.", "", None, None, "", "", gr.Code(visible=False)]
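    # Slot map (must stay in sync with the click handler's outputs list):
    # 0=DataFrame, 1=gallery, 2=status, 3=markdown, 4=CSV file, 5=XLSX file,
    # 6=tab-delimited text, 7=code snippet, 8=debug log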
|
req, res = None, None |
|
try: |
|
config = DATASET_CONFIG[dataset_key] |
|
repo_id = config["name"] |
|
|
|
|
|
code_snippet = generate_code_snippet(dataset_key, access_method, query) |
|
outputs[7] = code_snippet |
|
|
|
if "API" in access_method: |
|
all_results_df = pd.DataFrame() |
|
MAX_PAGES = 5 |
|
PAGE_SIZE = 100 |
|
|
|
if not query: |
|
MAX_PAGES = 1 |
|
outputs[2] = "β³ No search term. Fetching first 100 records as a sample..." |
|
yield tuple(outputs) |
|
|
|
for page in range(MAX_PAGES): |
|
if query: |
|
outputs[2] = f"β³ Searching page {page + 1}..." |
|
yield tuple(outputs) |
|
|
|
offset = page * PAGE_SIZE |
|
url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}" |
|
headers = get_auth_headers() if not config["is_public"] else {} |
|
|
|
res = requests.get(url, headers=headers) |
|
req = res.request |
|
res.raise_for_status() |
|
data = res.json() |
|
|
|
if not data.get('rows'): |
|
outputs[2] = "π No more data to search." |
|
yield tuple(outputs) |
|
break |
|
|
|
|
|
|
|
rows_data = [item['row'] for item in data['rows']] |
|
page_df = pd.json_normalize(rows_data) |
|
|
|
found_in_page = search_dataframe(page_df, query) |
|
|
|
if not found_in_page.empty: |
|
all_results_df = pd.concat([all_results_df, found_in_page]).reset_index(drop=True) |
|
outputs[0] = all_results_df |
|
outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df) |
|
outputs[2] = f"β
Found **{len(all_results_df)}** results so far..." |
|
|
|
if dataset_key == 'inscene': |
|
try: |
|
gallery_data = [] |
|
for _, row in all_results_df.iterrows(): |
|
if 'image' in row: |
|
image_data = row.get('image') |
|
text_data = row.get('text', '') |
|
|
|
|
|
if hasattr(image_data, 'save'): |
|
gallery_data.append((image_data, text_data)) |
|
elif isinstance(image_data, str): |
|
gallery_data.append((image_data, text_data)) |
|
|
|
if gallery_data: |
|
                                outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
|
                        except Exception:

                            pass  # gallery preview is best-effort; the results table keeps updating
|
yield tuple(outputs) |
|
|
|
outputs[2] = f"π Search complete. Found a total of **{len(all_results_df)}** results." |
|
yield tuple(outputs) |
|
return |
|
|
|
outputs[2] = f"β³ Loading data via `{access_method}`..." |
|
yield tuple(outputs) |
|
|
|
df = pd.DataFrame() |
|
|
|
if "Pandas" in access_method: |
|
file_path = f"hf://datasets/{repo_id}/" |
|
if repo_id == "fka/awesome-chatgpt-prompts": |
|
file_path += "prompts.csv" |
|
df = pd.read_csv(file_path) |
|
else: |
|
try: |
|
df = pd.read_parquet(f"{file_path}data/train-00000-of-00001.parquet") |
|
            except Exception:
|
try: |
|
df = pd.read_parquet(f"{file_path}train.parquet") |
|
                except Exception:
|
df = pd.read_json(f"{file_path}medical_o1_sft.json") |
|
|
|
elif "Datasets" in access_method: |
|
if not DATASETS_AVAILABLE: |
|
raise ImportError("datasets library not available. Install with: pip install datasets") |
|
|
|
|
|
if dataset_key == 'inscene' and "Images" in access_method: |
|
outputs[2] = "πΌοΈ Loading InScene dataset with image processing..." |
|
yield tuple(outputs) |
|
|
|
|
|
ds = load_dataset(repo_id, split='train', streaming=True) |
|
data_list = list(ds.take(50)) |
|
df = pd.DataFrame(data_list) |
|
|
|
|
|
gallery_data = [] |
|
for i, item in enumerate(data_list): |
|
try: |
|
if 'image' in item and item['image'] is not None: |
|
image = item['image'] |
|
caption = item.get('text', f'Image {i+1}') |
|
|
|
|
|
if hasattr(image, 'save'): |
|
gallery_data.append((image, caption)) |
|
elif isinstance(image, str): |
|
gallery_data.append((image, caption)) |
|
|
|
|
|
if len(gallery_data) >= 20: |
|
break |
|
|
|
except Exception as img_error: |
|
continue |
|
|
|
|
|
if gallery_data: |
|
                    outputs[1] = gr.Gallery(gallery_data, label=f"🖼️ Found {len(gallery_data)} Images", height=400, columns=4, rows=2)

                    outputs[2] = f"🖼️ Loaded {len(df)} records with {len(gallery_data)} images"

                else:

                    outputs[2] = "🖼️ Loaded data but no images found to display"
|
|
|
else: |
|
|
|
ds = load_dataset(repo_id, split='train', streaming=True) |
|
data_list = list(ds.take(1000)) |
|
df = pd.DataFrame(data_list) |
|
outputs[2] = f"π Loaded {len(df)} records via Datasets library" |
|
|
|
outputs[2] = "π Searching loaded data..." |
|
yield tuple(outputs) |
|
|
|
final_df = search_dataframe(df, query) |
|
|
|
outputs[0] = final_df |
|
outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df) |
|
outputs[2] = f"π Search complete. Found **{len(final_df)}** results." |
|
|
|
if dataset_key == 'inscene' and not final_df.empty: |
|
|
|
try: |
|
gallery_data = [] |
|
for _, row in final_df.iterrows(): |
|
if 'image' in row: |
|
image_data = row.get('image') |
|
text_data = row.get('text', '') |
|
|
|
|
|
if hasattr(image_data, 'save'): |
|
gallery_data.append((image_data, text_data)) |
|
elif isinstance(image_data, str): |
|
gallery_data.append((image_data, text_data)) |
|
|
|
if gallery_data: |
|
                    outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
|
except Exception as img_error: |
|
outputs[2] += f"\nβ οΈ Image display error: {str(img_error)}" |
|
|
|
yield tuple(outputs) |
|
|
|
except Exception as e: |
|
yield handle_error(e, req, res) |
|
|
|
|
|
|
|
def create_dataset_tab(dataset_key: str): |
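    """Build one dataset tab: method radio, query box, results table (plus a
    gallery for InScene), export accordion, code snippet, and debug log."""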
|
config = DATASET_CONFIG[dataset_key] |
|
|
|
with gr.Tab(f"{config['emoji']} {dataset_key.capitalize()}"): |
|
gr.Markdown(f"## {config['emoji']} Query the `{config['name']}` Dataset") |
|
if not config['is_public']: |
|
gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.") |
|
|
|
|
|
available_methods = config['methods'] |
|
methods_note = f"**Available methods:** {len(available_methods)} tested and working methods" |
|
if dataset_key == 'inscene': |
|
methods_note += " (πΌοΈ = Image viewer included)" |
|
gr.Markdown(methods_note) |
|
|
|
with gr.Row(): |
|
access_method = gr.Radio( |
|
available_methods, |
|
label="π Access Method", |
|
value=available_methods[0] if available_methods else "π¨ API (requests)" |
|
) |
|
query = gr.Textbox( |
|
label="π Search Query", |
|
placeholder="Enter any text to search, or leave blank for samples..." |
|
) |
|
|
|
        fetch_button = gr.Button("🐕 Go Fetch!")

        status_output = gr.Markdown("👍 Ready to search.")

        df_output = gr.DataFrame(label="📊 Results DataFrame", interactive=False, wrap=True)
|
|
|
|
|
show_gallery = (dataset_key == 'inscene') |
|
        gallery_output = gr.Gallery(visible=show_gallery, label="🖼️ Image Results", height=400, columns=4, rows=2)
|
|
|
with gr.Accordion("π View/Export Full Results", open=False): |
|
            markdown_output = gr.Markdown(label="📝 Markdown View")
|
with gr.Row(): |
|
                csv_output = gr.File(label="⬇️ Download CSV")

                xlsx_output = gr.File(label="⬇️ Download XLSX")
|
            copy_output = gr.Code(label="📋 Copy-Paste (Tab-Delimited)")
|
|
|
        code_output = gr.Code(label="💻 Python Code Snippet", language="python")
|
|
|
        debug_log_output = gr.Code(label="🐛 Debug Log", visible=False)
|
|
|
fetch_button.click( |
|
fn=fetch_data, |
|
inputs=[gr.State(dataset_key), access_method, query], |
|
outputs=[ |
|
df_output, gallery_output, status_output, markdown_output, |
|
csv_output, xlsx_output, copy_output, code_output, |
|
debug_log_output |
|
] |
|
) |
|
|
|
|
|
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo: |
|
gr.Markdown("# π€ Hugging Face Dataset Explorer") |
|
gr.Markdown( |
|
"Select a dataset, choose an access method, and type a query. " |
|
"If an error occurs, a detailed debug log will appear to help troubleshoot the issue." |
|
) |
|
|
|
|
|
def get_dependency_status(): |
|
status = "### π§ Dataset-Specific Methods (Only Working Methods Shown):\n" |
|
for key, config in DATASET_CONFIG.items(): |
|
methods_str = ", ".join(config['methods']) |
|
auth_status = "π Requires Auth" if not config['is_public'] else "β
Public" |
|
status += f"- **{config['emoji']} {key.capitalize()}**: {methods_str} ({auth_status})\n" |
|
|
|
status += "\n### π Library Dependencies:\n" |
|
status += f"- **πΌ Pandas**: β
Available\n" |
|
status += f"- **π¨ Requests**: β
Available\n" |
|
status += f"- **π€ Datasets**: {'β
Available' if DATASETS_AVAILABLE else 'β Not installed'}\n" |
|
|
|
return status |
|
|
|
with gr.Accordion("π§ Library Status & Quick Start Guide", open=False): |
|
gr.Markdown(get_dependency_status()) |
|
gr.Markdown(""" |
|
    ### 🚀 Quick Start Guide:

    1. **🤖 Prompts**: Try Pandas or API method, search for "translator", "linux", or "writer"

    2. **⚖️ Caselaw**: Try API method only, search for "contract", "court", or "appeal"

    3. **💰 Finance**: Try Pandas or API method (requires auth), search for "interest" or "market"

    4. **🩺 Medical**: Try Pandas method only (requires auth), search for "diagnosis" or "treatment"

    5. **🖼️ InScene**: Try "🖼️ Datasets with Images" to see actual images, search for "kitchen" or "outdoor"

    ### 🔑 Authentication:

    For gated datasets (Finance, Medical, InScene), run: `huggingface-cli login`

    ### 🛠️ Method Explanations:

    - **📨 API**: Fast, reliable, works without login (100 rows max)

    - **🐼 Pandas**: Full dataset access, requires login for gated datasets

    - **🤗 Datasets**: Standard HuggingFace datasets library

    - **🖼️ Datasets with Images**: Special image viewer for InScene dataset

    ### ⚠️ Note:
|
Only working methods are shown for each dataset. Non-functional methods have been removed. |
|
""") |
|
|
|
if not DATASETS_AVAILABLE: |
|
gr.Markdown("**β οΈ Install datasets library for image viewing:** `pip install datasets`") |
|
|
|
with gr.Tabs(): |
|
for key in DATASET_CONFIG.keys(): |
|
create_dataset_tab(key) |
|
|
|
if __name__ == "__main__": |
|
demo.launch(debug=True) |