File size: 11,403 Bytes
d1eb676
da59af8
 
 
 
 
 
 
 
 
 
 
 
d1eb676
da59af8
 
 
 
 
 
 
 
d1eb676
da59af8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1eb676
da59af8
d1eb676
 
 
 
da59af8
 
d1eb676
da59af8
 
d1eb676
 
 
 
 
 
 
 
 
 
 
da59af8
d1eb676
da59af8
 
d1eb676
 
da59af8
 
 
 
 
 
 
 
 
 
 
 
 
 
d1eb676
da59af8
d1eb676
da59af8
 
d1eb676
da59af8
 
d1eb676
da59af8
 
 
 
d1eb676
da59af8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1eb676
da59af8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1eb676
da59af8
 
 
 
 
 
 
d1eb676
 
 
 
 
da59af8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1eb676
da59af8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1eb676
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# app.py
import gradio as gr
import pandas as pd
import requests
import io
import dask.dataframe as dd
from datasets import load_dataset, Image
from mlcroissant import Dataset as CroissantDataset
from huggingface_hub import get_token
import polars as pl
import warnings
import traceback
import json
import tempfile # Added for creating temporary files

# 🀫 Let's ignore those pesky warnings, shall we?
warnings.filterwarnings("ignore")

# --- βš™οΈ Configuration & Constants ---
DATASET_CONFIG = {
    "caselaw": {
        "name": "common-pile/caselaw_access_project", "emoji": "βš–οΈ",
        "methods": ["πŸ’¨ API (requests)", "🧊 Dask", "πŸ₯ Croissant"], "is_public": True,
    },
    "prompts": {
        "name": "fka/awesome-chatgpt-prompts", "emoji": "πŸ€–",
        "methods": ["🐼 Pandas", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": True,
    },
    "finance": {
        "name": "snorkelai/agent-finance-reasoning", "emoji": "πŸ’°",
        "methods": ["🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
    },
    "medical": {
        "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
        "methods": ["🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
    },
    "inscene": {
        "name": "peteromallet/InScene-Dataset", "emoji": "πŸ–ΌοΈ",
        "methods": ["πŸ€— Datasets", "🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
    },
}

# --- ν—¬ Helpers & Utility Functions ---

def get_auth_headers():
    token = get_token()
    return {"Authorization": f"Bearer {token}"} if token else {}

# --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
def dataframe_to_outputs(df: pd.DataFrame):
    """
    πŸ“œ Takes a DataFrame and transforms it into various formats.
    Now uses temporary files for maximum Gradio compatibility.
    """
    if df.empty:
        return "No results found. 🀷", None, None, "No results to copy."

    df_str = df.astype(str)
    markdown_output = df_str.to_markdown(index=False)
    
    # Create a temporary CSV file
    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv:
        df.to_csv(tmp_csv.name, index=False)
        csv_path = tmp_csv.name

    # Create a temporary XLSX file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx:
        df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl')
        xlsx_path = tmp_xlsx.name

    tab_delimited_output = df.to_csv(sep='\t', index=False)
    
    return (
        markdown_output,
        csv_path,
        xlsx_path,
        tab_delimited_output,
    )

def handle_error(e: Exception, request=None, response=None):
    """
    😱 Oh no! An error! This function now creates a detailed debug log.
    """
    error_message = f"🚨 An error occurred: {str(e)}\n"
    auth_tip = "πŸ”‘ For gated datasets, did you log in? Try `huggingface-cli login` in your terminal."
    full_trace = traceback.format_exc()
    print(full_trace)
    if "401" in str(e) or "Gated" in str(e):
        error_message += auth_tip
    
    debug_log = f"""--- 🐞 DEBUG LOG ---\nTraceback:\n{full_trace}\n\nException Type: {type(e).__name__}\nException Details: {e}\n"""
    if request:
        debug_log += f"""\n--- REQUEST ---\nMethod: {request.method}\nURL: {request.url}\nHeaders: {json.dumps(dict(request.headers), indent=2)}\n"""
    if response is not None:
        try:
            response_text = json.dumps(response.json(), indent=2)
        except json.JSONDecodeError:
            response_text = response.text
        debug_log += f"""\n--- RESPONSE ---\nStatus Code: {response.status_code}\nHeaders: {json.dumps(dict(response.headers), indent=2)}\nContent:\n{response_text}\n"""
    
    return (
        pd.DataFrame(), gr.Gallery(None), f"### 🚨 Error\nAn error occurred. See the debug log below for details.",
        "", None, None, "", f"```python\n# 🚨 Error during execution:\n# {e}\n```",
        gr.Code(value=debug_log, visible=True)
    )

def search_dataframe(df: pd.DataFrame, query: str):
    if not query:
        return df.head(100)
    string_cols = df.select_dtypes(include=['object', 'string']).columns
    if string_cols.empty:
        return pd.DataFrame()
    mask = pd.Series([False] * len(df))
    for col in string_cols:
        mask |= df[col].astype(str).str.contains(query, case=False, na=False)
    return df[mask]

# --- 🎣 Data Fetching & Processing Functions ---
def fetch_data(dataset_key: str, access_method: str, query: str):
    """
    πŸš€ Main mission control. Always yields a tuple of 9 values to match the UI components.
    """
    outputs = [pd.DataFrame(), None, "🏁 Ready.", "", None, None, "", "", gr.Code(visible=False)]
    req, res = None, None
    try:
        config = DATASET_CONFIG[dataset_key]
        repo_id = config["name"]
        
        if "API" in access_method:
            all_results_df = pd.DataFrame()
            MAX_PAGES = 5
            PAGE_SIZE = 100

            if not query:
                MAX_PAGES = 1
                outputs[2] = "⏳ No search term. Fetching first 100 records as a sample..."
                yield tuple(outputs)

            for page in range(MAX_PAGES):
                if query:
                    outputs[2] = f"⏳ Searching page {page + 1}..."
                    yield tuple(outputs)
                
                offset = page * PAGE_SIZE
                url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}"
                headers = get_auth_headers() if not config["is_public"] else {}
                
                res = requests.get(url, headers=headers)
                req = res.request
                res.raise_for_status()
                data = res.json()

                if not data.get('rows'):
                    outputs[2] = "🏁 No more data to search."
                    yield tuple(outputs)
                    break

                # --- ✨ FIXED: JSON processing logic ---
                # Extract the actual data from the 'row' key of each item in the list
                rows_data = [item['row'] for item in data['rows']]
                page_df = pd.json_normalize(rows_data)
                
                found_in_page = search_dataframe(page_df, query)

                if not found_in_page.empty:
                    all_results_df = pd.concat([all_results_df, found_in_page]).reset_index(drop=True)
                    outputs[0] = all_results_df
                    outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
                    outputs[2] = f"βœ… Found **{len(all_results_df)}** results so far..."
                    
                    if dataset_key == 'inscene':
                        gallery_data = [(row['image'], row.get('text', '')) for _, row in all_results_df.iterrows() if 'image' in row and isinstance(row['image'], Image.Image)]
                        outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)
                    yield tuple(outputs)

            outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
            yield tuple(outputs)
            return

        outputs[2] = f"⏳ Loading data via `{access_method}`..."
        yield tuple(outputs)
        
        df = pd.DataFrame()
        if "Pandas" in access_method:
            file_path = f"hf://datasets/{repo_id}/"
            if repo_id == "fka/awesome-chatgpt-prompts": file_path += "prompts.csv"; df = pd.read_csv(file_path)
            else: 
                try: df = pd.read_parquet(f"{file_path}data/train-00000-of-00001.parquet")
                except:
                     try: df = pd.read_parquet(f"{file_path}train.parquet")
                     except: df = pd.read_json(f"{file_path}medical_o1_sft.json")
        elif "Datasets" in access_method:
            ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
            df = pd.DataFrame(ds)
        
        outputs[2] = "πŸ” Searching loaded data..."
        yield tuple(outputs)

        final_df = search_dataframe(df, query)
        
        outputs[0] = final_df
        outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
        outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."
        
        if dataset_key == 'inscene' and not final_df.empty:
            gallery_data = [(row['image'], row.get('text', '')) for _, row in final_df.iterrows() if 'image' in row and isinstance(row.get('image'), Image.Image)]
            outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)
        
        yield tuple(outputs)

    except Exception as e:
        yield handle_error(e, req, res)


# --- πŸ–ΌοΈ UI Generation ---
def create_dataset_tab(dataset_key: str):
    config = DATASET_CONFIG[dataset_key]
    
    with gr.Tab(f"{config['emoji']} {dataset_key.capitalize()}"):
        gr.Markdown(f"## {config['emoji']} Query the `{config['name']}` Dataset")
        if not config['is_public']:
            gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")
        
        with gr.Row():
            access_method = gr.Radio(config['methods'], label="πŸ”‘ Access Method", value=config['methods'][0])
            query = gr.Textbox(label="πŸ” Search Query", placeholder="Enter any text to search, or leave blank for samples...")
        
        fetch_button = gr.Button("πŸš€ Go Fetch!")
        status_output = gr.Markdown("🏁 Ready to search.")
        df_output = gr.DataFrame(label="πŸ“Š Results DataFrame", interactive=False, wrap=True)
        gallery_output = gr.Gallery(visible=(dataset_key == 'inscene'), label="πŸ–ΌοΈ Image Results")

        with gr.Accordion("πŸ“‚ View/Export Full Results", open=False):
            markdown_output = gr.Markdown(label="πŸ“ Markdown View")
            with gr.Row():
                csv_output = gr.File(label="⬇️ Download CSV")
                xlsx_output = gr.File(label="⬇️ Download XLSX")
            copy_output = gr.Code(label="πŸ“‹ Copy-Paste (Tab-Delimited)")
        
        code_output = gr.Code(label="πŸ’» Python Code Snippet", language="python")
        
        debug_log_output = gr.Code(label="🐞 Debug Log", visible=False)
        
        fetch_button.click(
            fn=fetch_data,
            inputs=[gr.State(dataset_key), access_method, query],
            outputs=[
                df_output, gallery_output, status_output, markdown_output,
                csv_output, xlsx_output, copy_output, code_output,
                debug_log_output
            ]
        )

# --- πŸš€ Main App ---
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
    gr.Markdown("# πŸ€— Hugging Face Dataset Explorer")
    gr.Markdown(
        "Select a dataset, choose an access method, and type a query. "
        "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
    )
    with gr.Tabs():
        for key in DATASET_CONFIG.keys():
            create_dataset_tab(key)

if __name__ == "__main__":
    demo.launch(debug=True)