geolocation-from-speech-demo committed
Commit 57b83ef · verified · 1 Parent(s): 68975a7

Create app.py

Files changed (1)
  1. app.py +194 -0
app.py ADDED
@@ -0,0 +1,194 @@
import torch
import time
import lhotse
import numpy as np
import os
from transformers import Wav2Vec2ForCTC, Wav2Vec2ForPreTraining
import gradio as gr
import geoviews as gv
import geoviews.tile_sources as gts
import uuid
import gdown
import math
import torch.nn as nn


device = torch.device("cpu")

class AttentionPool(nn.Module):
    def __init__(self, att, query_embed):
        super(AttentionPool, self).__init__()
        self.query_embed = query_embed
        self.att = att

    def forward(self, x: torch.Tensor, x_lens: torch.Tensor) -> torch.Tensor:
        # Build a padding mask: True marks frames beyond each utterance's length
        max_seq_length = x_lens.max().item()
        mask = torch.arange(max_seq_length)[None, :].to(x.device) >= x_lens[:, None]

        # Pool the sequence with a single learned query; key_padding_mask has
        # shape (batch_size, max_seq_length), as nn.MultiheadAttention expects
        x, w = self.att(
            self.query_embed.unsqueeze(0).unsqueeze(1).repeat(x.size(0), 1, 1),
            x,
            x,
            key_padding_mask=mask
        )
        x = x.squeeze(1)
        return x, w


class AveragePool(nn.Module):
    def __init__(self):
        super(AveragePool, self).__init__()

    def forward(self, x: torch.Tensor, x_lens: torch.Tensor) -> torch.Tensor:
        # Mask out padded frames, then average only over the valid frames
        max_seq_length = x_lens.max().item()
        mask = torch.arange(max_seq_length)[None, :].to(x.device) >= x_lens[:, None]
        x[mask] = torch.nan
        return x.nanmean(dim=1), None


class Wav2Vec2Model(nn.Module):
    def __init__(self,
                 modelpath='facebook/mms-300m',
                 freeze_feat_extractor=True,
                 pooling_loc=0,
                 pooling_type='att',
                 ):
        super(Wav2Vec2Model, self).__init__()
        try:
            self.encoder = Wav2Vec2ForCTC.from_pretrained(modelpath).wav2vec2
        except Exception:
            self.encoder = Wav2Vec2ForPreTraining.from_pretrained(modelpath).wav2vec2

        if freeze_feat_extractor:
            self.encoder.feature_extractor._freeze_parameters()
        self.freeze_feat_extractor = freeze_feat_extractor
        self.odim = self._get_output_dim()

        self.frozen = False
        if pooling_type == 'att':
            assert pooling_loc == 0
            self.att = nn.MultiheadAttention(self.odim, 1, batch_first=True)
            self.loc_embed = nn.Parameter(
                torch.FloatTensor(self.odim).uniform_(-1, 1)
            )
            self.pooling = AttentionPool(self.att, self.loc_embed)
        elif pooling_type == 'avg':
            self.pooling = AveragePool()
        self.pooling_type = pooling_type
        # pooling_loc controls where pooling happens:
        # 0: on encoder embeddings, 1: on unnormalized coords, 2: on normalized coords
        self.pooling_loc = pooling_loc
        self.linear_out = nn.Linear(self.odim, 3)

    def forward(self, x: torch.Tensor, x_lens: torch.Tensor) -> torch.Tensor:
        x = self.encoder(
            x.squeeze(-1), output_hidden_states=False
        )[0]

        # Track sample counts through the wav2vec2 feature extractor's conv layers
        # (kernel width, stride) so x_lens matches the encoder output length
        for width, stride in [(10, 5), (3, 2), (3, 2), (3, 2), (3, 2), (2, 2), (2, 2)]:
            x_lens = torch.floor((x_lens - width) / stride + 1)
        if self.pooling_loc == 0:
            x, w = self.pooling(x, x_lens)
            x = self.linear_out(x)
            x = x.div(x.norm(dim=1).unsqueeze(-1))  # project onto the unit sphere
        elif self.pooling_loc == 1:
            x = self.linear_out(x)
            x, w = self.pooling(x, x_lens)
            x = x.div(x.norm(dim=1).unsqueeze(-1))
        elif self.pooling_loc == 2:
            x = self.linear_out(x)
            x = x.div(x.norm(dim=1).unsqueeze(-1))
            x, w = self.pooling(x, x_lens)
            x = x.div(x.norm(dim=1).unsqueeze(-1))
        return x, w

    def freeze_encoder(self):
        for p in self.encoder.encoder.parameters():
            if p.requires_grad:
                p.requires_grad = False
        self.frozen = True

    def unfreeze_encoder(self):
        for i, p in enumerate(self.encoder.encoder.parameters()):
            p.requires_grad = True
        if self.freeze_feat_extractor:
            self.encoder.feature_extractor._freeze_parameters()
        self.frozen = False

    def _get_output_dim(self):
        # Probe the encoder with a dummy waveform to get its hidden dimension
        x = torch.rand(1, 400)
        return self.encoder(x).last_hidden_state.size(-1)


# download model checkpoint
# bad way to do this probably but oh well
if 'checkpoint.pt' not in os.listdir():
    checkpoint_url = "https://drive.google.com/uc?id=162jJ_YC4MGEfXBWvAK-kXnZcXX3v1smr"
    output = "checkpoint.pt"
    gdown.download(checkpoint_url, output, quiet=False)

model = Wav2Vec2Model()
model.to(device)

# load the first matching checkpoint found in the working directory
for f in os.listdir():
    if '.pt' in f and 'checkpoint' in f:
        checkpoint = torch.load(f, map_location='cpu')
        model.load_state_dict(checkpoint)
        model.eval()
        print(f'Loaded state dict {f}')

def predict(audio_path):
    # get raw audio data
    try:
        a = lhotse.Recording.from_file(audio_path)
    except Exception:
        return (None, "Please wait a bit until the audio file has uploaded, then try again")
    a = a.resample(16000)
    # if multi-channel, downmix to a single channel
    a = lhotse.cut.MultiCut(recording=a, start=0, duration=10, id="temp", channel=a.to_dict()['sources'][0]['channels']).to_mono(mono_downmix=True)
    cuts = lhotse.CutSet(cuts={"cut": a})

    audio_data, audio_lens = lhotse.dataset.collation.collate_audio(cuts)

    # pass through model; x is a unit vector on the sphere
    x, _ = model.forward(audio_data, audio_lens)
    print(x)

    # convert the predicted 3D unit vector to latitude/longitude in degrees
    pred_lon = torch.atan2(x[:, 0], x[:, 1]).unsqueeze(-1)
    pred_lat = torch.asin(x[:, 2]).unsqueeze(-1)
    x_polar = torch.cat((pred_lat, pred_lon), dim=1).to(device)
    coords = x_polar.mul(180. / math.pi).cpu().detach().numpy()
    print(coords)

    # wraparound fix (lat > 90); result is ordered (lon, lat) for plotting
    coords = [[-lon, math.degrees(math.asin(math.sin(math.radians(lat))))] if lat > 90 else [lon, lat] for lat, lon in coords][0]

    # create plot
    guesses = gv.Points([coords]).opts(
        size=8, cmap='Spectral_r', color='blue', fill_alpha=1
    )
    plot = (gts.OSM * guesses).options(
        gv.opts.Points(width=800, height=400, xlim=(-180*110000, 180*110000), ylim=(-90*140000, 90*140000), xaxis=None, yaxis=None)
    )
    filename = f"{str(uuid.uuid4())}.png"
    gv.save(plot, filename=filename, fmt='png')
    coords = [round(i, 2) for i in coords]
    coords = [coords[1], coords[0]]  # reorder to (latitude, longitude) for display
    print(filename, coords)
    return (filename, str(coords)[1:-1])

gradio_app = gr.Interface(
    predict,
    inputs=gr.Audio(label="Record Audio (10 seconds)", type="filepath", min_length=10.0),
    outputs=[gr.Image(type="filepath", label="Map of Prediction"), gr.Textbox(placeholder="Latitude, Longitude", label="Prediction (Latitude, Longitude)")],
    title="Speech Geolocation Demo",
)

if __name__ == "__main__":
    gradio_app.launch()
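
Note on the coordinate decoding in predict(): the model's 3-dimensional, unit-normalized output is read back as longitude = atan2(x0, x1) and latitude = asin(x2). The sketch below shows the mapping this decoding implies, both directions, so the geometry is easy to check; it is an inference from the code above (the training-side target construction is not part of this commit), and the helper names are illustrative only.

    import math

    def latlon_to_unit_vector(lat_deg: float, lon_deg: float):
        # Map (latitude, longitude) in degrees to the unit-sphere point implied
        # by predict(): lon = atan2(x0, x1), lat = asin(x2). Assumed convention.
        lat, lon = math.radians(lat_deg), math.radians(lon_deg)
        return (math.cos(lat) * math.sin(lon),   # x0
                math.cos(lat) * math.cos(lon),   # x1
                math.sin(lat))                   # x2

    def unit_vector_to_latlon(x0: float, x1: float, x2: float):
        # Inverse mapping, matching the decoding used in predict()
        return math.degrees(math.asin(x2)), math.degrees(math.atan2(x0, x1))

    # Round-trip check with an arbitrary point (~50.1 N, 14.4 E)
    vec = latlon_to_unit_vector(50.1, 14.4)
    print(unit_vector_to_latlon(*vec))  # -> approximately (50.1, 14.4)

Running `python app.py` launches the Gradio interface defined above; the recorded clip is resampled to 16 kHz, truncated to 10 seconds, and the predicted point is rendered on an OSM tile map.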