import os

import gradio as gr
from PIL import Image
import torch
from transformers import ViTForImageClassification, ViTImageProcessor
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
import cv2

# Model and processor configuration
model_name_or_path = "google/vit-base-patch16-224-in21k"
processor = ViTImageProcessor.from_pretrained(model_name_or_path)

# Load the dataset (adjust dataset_path accordingly); only the label names are used here
dataset_path = "pawlo2013/chest_xray"
train_dataset = load_dataset(dataset_path, split="train")
class_names = train_dataset.features["label"].names

# Load the fine-tuned ViT model from the local ./models directory
model = ViTForImageClassification.from_pretrained(
    "./models",
    num_labels=len(class_names),
    id2label={str(i): label for i, label in enumerate(class_names)},
    label2id={label: i for i, label in enumerate(class_names)},
)

# Set the model to evaluation mode
model.eval()


# Classify an image and build an attention-rollout heatmap for it
def classify_and_visualize(
    img_path, device="cpu", discard_ratio=0.9, head_fusion="mean"
):
    img = Image.open(img_path).convert("RGB")
    processed_input = processor(images=img, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**processed_input)
        logits = outputs.logits

    probabilities = torch.softmax(logits, dim=1)[0].tolist()
    result = {class_name: prob for class_name, prob in zip(class_names, probabilities)}

    filename = os.path.basename(img_path).split(".")[0]

    # Generate the attention heatmap
    heatmap_img = show_final_layer_attention_maps(
        model, processed_input, device, discard_ratio, head_fusion
    )

    return {"filename": filename, "probabilities": result, "heatmap": heatmap_img}


def format_output(output):
    return (
        f"{output['filename']}",
        output["probabilities"],
        gr.Image(value=output["heatmap"]),
    )


# Collect example image paths from a folder
def load_examples_from_folder(folder_path):
    examples = []
    for file in os.listdir(folder_path):
        if file.endswith((".png", ".jpg", ".jpeg")):
            examples.append(os.path.join(folder_path, file))
    return examples


# Compute an attention-rollout heatmap and superimpose it on the input image
def show_final_layer_attention_maps(
    model, tensor, device, discard_ratio=0.6, head_fusion="max", only_last_layer=False
):
    # Drop the batch dimension; the processor returned a single image
    image = tensor["pixel_values"].to(device).squeeze(0)

    with torch.no_grad():
        # Forward pass, keeping the per-layer attention matrices
        outputs = model(**tensor, output_attentions=True)

    # Scale the image to [0, 1]
    image = image - image.min()
    image = image / image.max()

    # Initialize the rollout result and recursively fuse the attention maps
    result = torch.eye(outputs.attentions[0].size(-1)).to(device)

    if only_last_layer:
        attention_list = outputs.attentions[-1].unsqueeze(0).to(device)
    else:
        attention_list = outputs.attentions

    for attention in attention_list:
        # Fuse the attention heads into a single map
        if head_fusion == "mean":
            attention_heads_fused = attention.mean(axis=1)
        elif head_fusion == "max":
            attention_heads_fused = attention.max(axis=1)[0]
        elif head_fusion == "min":
            attention_heads_fused = attention.min(axis=1)[0]

        # Zero out the weakest attention weights, never dropping the CLS token (index 0)
        flat = attention_heads_fused.view(attention_heads_fused.size(0), -1)
        _, indices = flat.topk(int(flat.size(-1) * discard_ratio), -1, False)
        indices = indices[indices != 0]
        flat[0, indices] = 0

        # Add the identity to model the residual connection, then row-normalize
        # (keepdim=True so each row is divided by its own sum)
        I = torch.eye(attention_heads_fused.size(-1)).to(device)
        a = (attention_heads_fused + 1.0 * I) / 2
        a = a / a.sum(dim=-1, keepdim=True)
        result = torch.matmul(a, result)

    # CLS-token attention over the patch tokens
    mask = result[0, 0, 1:]

    # For a 224x224 input with 16x16 patches, the 196 patch tokens form a 14x14 grid
    width = int(mask.size(-1) ** 0.5)
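    # `mask` is attention rollout (Abnar & Zuidema, 2020): each layer's fused
    # attention is mixed with the identity, row-normalized, and chained by
    # matrix multiplication, so row 0 of the product (minus the CLS column)
    # measures how much the CLS token attends to each image patch. It is
    # reshaped below into that patch grid for display.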
    mask = mask.reshape(width, width).cpu().numpy()
    mask = mask / np.max(mask)
    mask = cv2.resize(mask, (224, 224))

    # Normalize the mask to [0, 1] for visualization
    mask = (mask - np.min(mask)) / (np.max(mask) - np.min(mask))
    heatmap = plt.cm.jet(mask)[:, :, :3]  # Apply the colormap, dropping alpha

    # Superimpose the heatmap on the original image
    showed_img = image.permute(1, 2, 0).detach().cpu().numpy()
    showed_img = (showed_img - np.min(showed_img)) / (
        np.max(showed_img) - np.min(showed_img)
    )  # Normalize the image
    superimposed_img = heatmap * 0.4 + showed_img * 0.6  # Blend heatmap and image

    superimposed_img_pil = Image.fromarray((superimposed_img * 255).astype(np.uint8))

    return superimposed_img_pil


# Define the path to the examples folder
examples_folder = "./examples"
examples = load_examples_from_folder(examples_folder)

# Create the Gradio interface
iface = gr.Interface(
    fn=lambda img: format_output(classify_and_visualize(img)),
    inputs=gr.Image(type="filepath"),
    outputs=[
        gr.Textbox(label="True Label (from filename)"),
        gr.Label(),
        gr.Image(label="Attention Heatmap"),
    ],
    examples=examples,
    title="Pneumonia X-Ray 3-Class Classification with Vision Transformer (ViT) using data augmentation",
    description=(
        "Upload an X-ray image to classify it as normal, viral pneumonia, or "
        "bacterial pneumonia. Check out the model in more detail "
        "[here](https://huggingface.co/pawlo2013/vit-pneumonia-x-ray_3_class). "
        "The examples presented are taken from the test set of the "
        "[Kermany et al. (2018) dataset](https://data.mendeley.com/datasets/rscbjbr9sj/2). "
        "The attention heatmap over all layers of the transformer is computed "
        "with the attention rollout technique, following the implementation by "
        "[jacobgil](https://github.com/jacobgil/vit-explain)."
    ),
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
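
# A minimal sketch of exercising the pipeline directly, without the Gradio UI.
# The image path is a hypothetical example, not a file shipped with this app:
#
#   output = classify_and_visualize("./examples/sample.jpeg")
#   print(output["filename"], output["probabilities"])
#   output["heatmap"].save("attention_heatmap.png")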