import torch
import gradio as gr
import spaces
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    TextIteratorStreamer,
)
from codecarbon import EmissionsTracker
# quantization_config = BitsAndBytesConfig(load_in_4bit=True)
torch_device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
torch_dtype = torch.float32  # could be torch.float16 if torch_device is "cuda" or "mps"

model_name = "meta-llama/Llama-3.2-3B-Instruct"

llama_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,  # load in bfloat16 for numerical stability
    device_map=torch_device,
    # load_in_4bit=True,  # 4-bit loading for low-memory devices
)
llama_tokenizer = AutoTokenizer.from_pretrained(model_name)
# streamer = TextStreamer(llama_tokenizer)
llama32_3b_pipe = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    # streamer=streamer,
)
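# A minimal sketch (kept commented out so nothing runs at import time) of how the
# text-generation pipeline consumes chat-format messages in recent transformers
# versions: for chat input, each output dict's "generated_text" holds the full
# message list, with the assistant reply appended as the last entry.
#
# example_messages = [
#     {"role": "system", "content": "You are a helpful chatbot assistant."},
#     {"role": "user", "content": "Hello!"},
# ]
# example_out = llama32_3b_pipe(example_messages, max_new_tokens=32)
# print(example_out[0]["generated_text"][-1]["content"])  # assistant reply text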
def calc_co2e_response(co2grams: float) -> str:
    response = f"""Your query just produced approximately {co2grams:.4f} grams of CO2e.
On average, eight queries are sent to ChatGPT per user per day. If you were to run this specific query with our model (Llama 3.2 3B) seven more times (eight in total), it would generate approximately {co2grams*8:.4f} grams of CO2e.
Roughly 1 billion queries are sent to ChatGPT globally each day. Running this query 1 billion times with this model would equate to roughly {co2grams*1e3:.4f} metric tonnes of CO2e emissions.
"""
    return response
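# Worked example of the unit conversion above (for illustration only, not called anywhere):
# if one query emits 0.5 g CO2e, 1 billion queries emit 0.5 * 1e9 g = 5e8 g, and since
# 1 metric tonne = 1e6 g, that is 5e8 / 1e6 = 500 tonnes = 0.5 * 1e3 tonnes, which is
# why the f-string above multiplies co2grams by 1e3 to report metric tonnes.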
@spaces.GPU  # ZeroGPU Spaces only allocate a GPU to functions wrapped in this decorator
def llama32_3b_chat(message: str) -> tuple[str, str]:
    """Runs the query through the pipeline and returns the generated text plus an emissions summary."""
    with EmissionsTracker() as tracker:
        input_history = [{"role": "system", "content": "You are a helpful chatbot assistant. You will answer all queries as best you can."}]
        input_history.append({"role": "user", "content": f"{message}"})
        # TODO: add handling for the model's context window here
        outputs = llama32_3b_pipe(
            input_history,
            max_new_tokens=512,
        )
    co2grams = tracker.final_emissions * 1000  # codecarbon reports kg CO2eq; convert to grams
    return outputs[-1]['generated_text'][-1]['content'], calc_co2e_response(co2grams)
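# A hypothetical usage sketch (commented out so the Space only serves the Gradio UI):
# calling the chat function directly returns the model's reply and the emissions summary.
#
# reply, emissions_text = llama32_3b_chat("What is the meaning of life?")
# print(reply)
# print(emissions_text)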
# Create the Gradio interface
def create_interface():
    with gr.Blocks() as demo:
        with gr.Row():
            text_input = gr.Textbox(label="Query", value="What is the meaning of life?")
        with gr.Row():
            submit_btn = gr.Button("Generate response")
        with gr.Row():
            text_output = gr.Textbox(interactive=False, label="Response", show_label=True)
        with gr.Row():
            query_emissions_output = gr.Textbox(interactive=False, label="Query Emissions", show_label=True)

        submit_btn.click(
            fn=llama32_3b_chat,
            inputs=[text_input],
            outputs=[text_output, query_emissions_output],
        )
        with gr.Accordion("How did we get these numbers?", open=False):
            gr.Markdown("""## How emissions are calculated

This widget currently runs Llama 3.2 3B at 16-bit precision on a T4 instance on Hugging Face, and uses [CodeCarbon](https://github.com/mlco2/codecarbon) to measure the power used by the model during inference. CodeCarbon takes into account the location of the inference endpoint and the average emissions per kWh of its power grid. The first figure returned is reported directly by the CodeCarbon Python package and reflects the CO2e generated by the query.

The CO2e per user per day (the second returned result) is based on the average number of ChatGPT queries per user per day, derived from figures reported by [demandsage](https://www.demandsage.com/chatgpt-statistics/), which work out to approximately 8 queries per day.

The total CO2e per day (the final returned result) is based on the average number of queries sent globally per day, which, according to [demandsage](https://www.demandsage.com/chatgpt-statistics/), is over 1 billion.

**NOTE: commercial LLM deployments are likely to have significantly higher CO2e per query, given their much larger parameter counts and the proportionately more powerful hardware needed to run such models. Rough approximations of their power usage based on smaller open-source models are difficult to make, given the lack of transparency around inference hardware, model architecture, and any code- or kernel-level optimisations made by inference providers.**
""")
    return demo
# Launch the app
demo = create_interface()
demo.launch()