import torch
import gradio as gr
import spaces

from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,  # kept for the optional quantization config below
    TextStreamer,        # kept for the optional streamer setup below
)

from codecarbon import EmissionsTracker

# quantization_config = BitsAndBytesConfig(load_in_4bit=True)
torch_device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")

torch_dtype = torch.float32  # currently unused; the model below is loaded in bfloat16

model_name = "meta-llama/Llama-3.2-3B-Instruct"

llama_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,  # bfloat16 for numerical stability at 16-bit precision
    device_map=torch_device,
    # load_in_4bit=True,  # option for low-memory devices
)

llama_tokenizer = AutoTokenizer.from_pretrained(model_name)

# streamer = TextStreamer(llama_tokenizer)

llama32_3b_pipe = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    # streamer = streamer,
)
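
# Note: recent versions of transformers accept chat-format input for the
# text-generation pipeline (a list of {"role": ..., "content": ...} dicts)
# and apply the model's chat template automatically.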

def calc_co2e_response(co2grams: float) -> str:
    response = f"""Your query just produced approximately {co2grams:.4f} grams of CO2e.

The average ChatGPT user sends around eight queries per day. Running this specific query with our model (Llama 3.2 3B) seven more times, for eight in total, would produce approximately {co2grams*8:.4f} grams of CO2e.

Globally, around 1 billion queries are sent to ChatGPT each day. Running this query 1 billion times with this model would produce roughly {co2grams*1e3:.4f} metric tonnes of CO2e.
    """
    return response
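
# Illustrative unit check for the figures above (0.5 g per query is an assumed
# value, used only to make the arithmetic concrete):
#   0.5 g/query * 8 queries   = 4 g CO2e per user per day
#   0.5 g/query * 1e9 queries = 5e8 g = 500 metric tonnes, i.e. co2grams * 1e3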

@spaces.GPU
def llama32_3b_chat(message) -> tuple[str, str]:
    """Run the query through the pipeline and return the generated text along
    with a summary of the CO2e it produced."""
    with EmissionsTracker() as tracker:
        input_history = [
            {"role": "system", "content": "You are a helpful chatbot assistant. You will answer all queries as best you can."},
            {"role": "user", "content": f"{message}"},
        ]
        # TODO: add context-window handling here
    
        outputs = llama32_3b_pipe(
            input_history,
            max_new_tokens=512
        )
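    # codecarbon's EmissionsTracker reports final_emissions in kilograms of
    # CO2e; multiply by 1000 to express the per-query figure in grams.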
    co2grams = tracker.final_emissions*1000
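    # For chat-format input the pipeline returns the full conversation under
    # 'generated_text'; the last message is the assistant's reply.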
    return outputs[-1]['generated_text'][-1]['content'], calc_co2e_response(co2grams)
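
# A minimal local smoke test (hypothetical; the Space itself only calls this
# function through the Gradio UI below):
# reply, co2_summary = llama32_3b_chat("What is CO2e?")
# print(reply)
# print(co2_summary)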


# Create the Gradio interface
def create_interface():
    
    with gr.Blocks() as demo:
        with gr.Row():
            text_input = gr.Textbox(label="Query", value="What is the meaning of life?")
        with gr.Row():
            submit_btn = gr.Button("Generate response")
        with gr.Row():
            text_output = gr.Textbox(interactive=False, label="Response", show_label=True)
        with gr.Row():
            query_emissions_output = gr.Textbox(interactive=False, label="Query Emissions", show_label=True)


        submit_btn.click(
            fn=llama32_3b_chat,
            inputs=[text_input],
            outputs=[text_output, query_emissions_output]
        )
        with gr.Accordion("How did we get these numbers?", open=False):
            gr.Markdown("""## How emissions are calculated
This widget currently runs Llama 3.2 3B at 16-bit precision on a T4 instance on Hugging Face, and uses [CodeCarbon](https://github.com/mlco2/codecarbon) to measure the model's power usage during inference. CodeCarbon takes into account the location of the inference endpoint and the average emissions per kWh of its power grid. The first figure above is reported directly by the CodeCarbon Python package and reflects the CO2e generated by your query.

The CO2e per user per day (the second figure) is based on the average number of ChatGPT queries sent per user per day, derived from figures reported by [demandsage](https://www.demandsage.com/chatgpt-statistics/), which work out to approximately 8 queries per day.

The total CO2e per day (the final figure) is based on the average number of queries sent globally per day, which, according to [demandsage](https://www.demandsage.com/chatgpt-statistics/), is over 1 billion.

**NOTE: commercial LLM deployments are likely to produce significantly more CO2e per query, given much higher parameter counts and the proportionately more powerful hardware needed to run such large models. Rough approximations of their power usage based on smaller open-source models are difficult to make, given the lack of transparency around inference hardware, model architecture, and any code- or kernel-level optimisations made by inference providers.**
            """)
            
    
    return demo

# Launch the app
demo = create_interface()
demo.launch()