# carbon-footprint / carbon_footprint_gradio.py
import torch
import gradio as gr
import spaces
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
)
from codecarbon import EmissionsTracker
# quantization_config = BitsAndBytesConfig(load_in_4bit=True)
torch_device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
torch_dtype = torch.float32  # note: unused below; the model is loaded in bfloat16 instead
model_name = "meta-llama/Llama-3.2-3B-Instruct"
llama_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,  # cast to bfloat16 for numerical stability
    device_map=torch_device,
    # load_in_4bit=True,  # for memory-constrained devices
)
llama_tokenizer = AutoTokenizer.from_pretrained(model_name)
# streamer = TextStreamer(llama_tokenizer)

llama32_3b_pipe = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    # streamer=streamer,
)
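
# Note on output shape: when the text-generation pipeline is given a list of
# {"role": ..., "content": ...} messages, the "generated_text" field of each result
# holds the full conversation with the assistant's reply appended. A minimal sketch
# (the query string is only an illustration):
#
#   _out = llama32_3b_pipe([{"role": "user", "content": "Hi"}], max_new_tokens=16)
#   _reply = _out[0]["generated_text"][-1]["content"]  # assistant's reply text
#
# This is why llama32_3b_chat below indexes the last message of the last output.
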
def calc_co2e_response(co2grams: float):
    """Format a human-readable summary of the CO2e produced by a single query."""
    response = f"""Your query just produced approximately {co2grams:.4f} grams of CO2e.
On average, a ChatGPT user sends eight queries per day. If you ran this specific query with our model (Llama 3.2 3B) seven more times (eight in total), that would generate approximately {co2grams*8:.4f} grams of CO2e.
Globally, roughly 1 billion queries are sent to ChatGPT per day. Running this query 1 billion times with this model would produce roughly {co2grams*1e3:.4f} metric tonnes of CO2e.
"""
    return response
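
# Worked example of the scaling above, using a hypothetical figure of 0.5 g CO2e per query:
#   per user per day:  0.5 g × 8 queries             = 4.0 g CO2e
#   globally per day:  0.5 g × 1e9 queries ÷ 1e6 g/t = 500 metric tonnes CO2e
# which is why the message multiplies the per-query grams by 8 and by 1e3 respectively.
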
@spaces.GPU
def llama32_3b_chat(message) -> tuple[str, str]:
    """Run one chat turn and return (generated text, CO2e summary)."""
    with EmissionsTracker() as tracker:
        input_history = [{"role": "system", "content": """You are a helpful chatbot assistant. You will answer all queries as best you can.
"""}]
        input_history.append({"role": "user", "content": f"{message}"})
        # TODO: add context-window handling here
        outputs = llama32_3b_pipe(
            input_history,
            max_new_tokens=512,
        )
    # codecarbon reports final_emissions in kg of CO2e; convert to grams
    co2grams = tracker.final_emissions * 1000
    return outputs[-1]['generated_text'][-1]['content'], calc_co2e_response(co2grams)
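
# Example of calling the chat function directly, outside the Gradio UI (hypothetical query):
#   reply, emissions_summary = llama32_3b_chat("What is the meaning of life?")
#   print(emissions_summary)
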
# Create the Gradio interface
def create_interface():
    with gr.Blocks() as demo:
        with gr.Row():
            text_input = gr.Textbox(label="Query", value="What is the meaning of life?")
        with gr.Row():
            submit_btn = gr.Button("Generate response")
        with gr.Row():
            text_output = gr.Textbox(interactive=False, label="Response", show_label=True)
        with gr.Row():
            query_emissions_output = gr.Textbox(interactive=False, label="Query Emissions", show_label=True)

        submit_btn.click(
            fn=llama32_3b_chat,
            inputs=[text_input],
            outputs=[text_output, query_emissions_output],
        )
        with gr.Accordion("How did we get these numbers?", open=False):
            gr.Markdown("""## How emissions are calculated
This widget runs Llama 3.2 3B at 16-bit precision on a T4 instance on Hugging Face, and uses [CodeCarbon](https://github.com/mlco2/codecarbon) to measure the model's power usage during inference. CodeCarbon takes into account the location of the inference endpoint and the average emissions per kWh of its power grid. The first figure returned is reported directly from the CodeCarbon Python package and reflects the CO2e generated by your query.
The CO2e per user per day (the second figure) is based on the average number of ChatGPT queries sent per user per day, derived from figures reported by [demandsage](https://www.demandsage.com/chatgpt-statistics/) - approximately 8 queries per user per day.
The total CO2e per day (the final figure) is based on the average number of queries sent to ChatGPT globally per day, which, according to [demandsage](https://www.demandsage.com/chatgpt-statistics/), is over 1 billion.
**NOTE: commercial LLM deployments are likely to have significantly higher CO2e per query, given their much larger parameter counts and the proportionately more powerful hardware needed to run such models. Rough approximations of their power usage based on smaller open-source models are difficult to make, given the lack of transparency around inference hardware, model architecture, and any code- or kernel-level optimisations made by inference providers.**
""")
    return demo
# Launch the app
demo = create_interface()
demo.launch()