"""Gradio question-answering demo over ``synthetic_profit.csv`` using TAPAS.

The CSV is loaded once at import time, every cell is converted to a string,
and the ``google/tapas-base-finetuned-wtq`` checkpoint is used to select the
table cells that answer a free-text question.
"""

import os

import gradio as gr
import pandas as pd
import tensorflow as tf
from tapas.protos import interaction_pb2
from tapas.scripts import prediction_utils
from tapas.utils import number_annotation_utils
# The conversion config/converter classes are defined in ``tf_example_utils``.
from tapas.utils import tf_example_utils
from transformers import (
    AutoTokenizer,
    TFAutoModelForSequenceClassification,  # pre-existing import, kept in scope
    TFTapasForQuestionAnswering,
)

# 1) Read the CSV and build a list-of-lists table.
df = pd.read_csv("synthetic_profit.csv")
# TAPAS tokenization works on text, so every value must be a string.
df = df.astype(str)
# TAPAS-style table: header row first, then one list per data row.
table = [list(df.columns)] + df.values.tolist()

# 2) Low-level TAPAS conversion objects.
# NOTE(review): `predict_answer` does not use these (it relies on the
# Hugging Face tokenizer instead); they are kept so the module's public names
# stay unchanged. Remove them if nothing else imports them.
config = tf_example_utils.ClassifierConversionConfig(
    vocab_file="tapas_sqa_base/vocab.txt",
    max_seq_length=512,
    max_column_id=512,
    max_row_id=512,
    strip_column_names=False,  # Keep header names
    add_aggregation_candidates=True,  # Propose SUM/AVERAGE operations
)
converter = tf_example_utils.ToClassifierTensorflowExample(config)


# 3) Helper: wrap one question and the table in a TAPAS interaction proto.
def interaction_from_query(question: str) -> interaction_pb2.Interaction:
    """Build a TAPAS ``Interaction`` holding *question* and the module table.

    Args:
        question: Natural-language question, stored as the original text of
            the interaction's single question.

    Returns:
        An ``Interaction`` whose table has one column per entry of
        ``table[0]`` and one row per entry of ``table[1:]``, after
        ``number_annotation_utils.add_numeric_values`` has been applied.
    """
    interaction = interaction_pb2.Interaction()

    # Add the question.
    q = interaction.questions.add()
    q.original_text = question

    # Add the table columns (header row).
    for col in table[0]:
        interaction.table.columns.add().text = col

    # Add the table rows and their cells.
    for row in table[1:]:
        r = interaction.table.rows.add()
        for cell in row:
            r.cells.add().text = cell

    # Annotate numeric values in place.
    number_annotation_utils.add_numeric_values(interaction)
    return interaction


# 4) Instantiate the TAPAS tokenizer and model.
MODEL = "google/tapas-base-finetuned-wtq"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# The WTQ checkpoint is a table question-answering model. It has to be loaded
# with the question-answering head, which produces the cell-selection and
# aggregation logits; a sequence-classification head does not.
model = TFTapasForQuestionAnswering.from_pretrained(MODEL)


# 5) Prediction helper.
def predict_answer(question: str) -> str:
    """Answer *question* with the table cells selected by TAPAS.

    Args:
        question: Free-text question about the loaded CSV table.

    Returns:
        The text of the selected cells joined with ``", "``. An empty string
        is returned for a blank question or when no cell is selected.
    """
    # Nothing to ask: skip tokenization and the model call.
    if not question or not question.strip():
        return ""

    # Encode the table together with the question. The model and the
    # post-processing step both consume this encoding.
    inputs = tokenizer(
        table=df,
        queries=[question],
        padding="max_length",
        # Drop rows that do not fit into the model's maximum sequence length
        # instead of failing on large tables.
        truncation=True,
        return_tensors="tf",
    )
    outputs = model(**inputs)

    # Turn the logits into (row, column) coordinates, one list per query.
    # The aggregation indices are not needed for the cell-text answer.
    answer_coordinates, _ = tokenizer.convert_logits_to_predictions(
        inputs, outputs.logits, outputs.logits_aggregation
    )

    # Coordinates index the data rows, so skip the header row of `table`.
    return ", ".join(table[row + 1][col] for row, col in answer_coordinates[0])


# 6) Gradio interface.
iface = gr.Interface(
    fn=predict_answer,
    inputs=gr.Textbox(lines=2, placeholder="Ask a question…"),
    outputs=gr.Textbox(lines=3),
    title="SAP Profitability Q&A (TAPAS Low-Level)",
    description=(
        "Low-level TAPAS: list-of-lists input, numeric annotations, "
        "aggregation candidates, and coordinate post-processing."
    ),
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)