import torch
import gradio as gr
import spaces
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    TextIteratorStreamer,
)
from codecarbon import EmissionsTracker
# quantization_config = BitsAndBytesConfig(load_in_4bit=True)
torch_device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
torch_dtype = torch.float32  # could be torch.float16 if torch_device is "cuda" or "mps"

model_name = "meta-llama/Llama-3.2-3B-Instruct"

llama_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,  # load in bfloat16 for numerical stability
    device_map=torch_device,
    # load_in_4bit=True,  # 4-bit loading for low-memory devices
)
llama_tokenizer = AutoTokenizer.from_pretrained(model_name)
# streamer = TextStreamer(llama_tokenizer)
llama32_3b_pipe = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    # streamer=streamer,
)
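# A minimal sketch (kept commented out so nothing runs at import time) of how the
# text-generation pipeline consumes chat-format messages in recent transformers
# versions: for chat input, each output dict's "generated_text" holds the full
# message list, with the assistant reply appended as the last entry.
#
# example_messages = [
#     {"role": "system", "content": "You are a helpful chatbot assistant."},
#     {"role": "user", "content": "Hello!"},
# ]
# example_out = llama32_3b_pipe(example_messages, max_new_tokens=32)
# print(example_out[0]["generated_text"][-1]["content"])  # assistant reply text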
def calc_co2e_response(co2grams: float) -> str:
    response = f"""Your query just produced approximately {co2grams:.4f} grams of CO2e.
On average, eight queries are sent to ChatGPT per user per day. If you were to run this specific query with our model (Llama 3.2 3B) seven more times (eight in total), it would generate approximately {co2grams*8:.4f} grams of CO2e.
Roughly 1 billion queries are sent to ChatGPT globally each day. Running this query 1 billion times with this model would equate to roughly {co2grams*1e3:.4f} metric tonnes of CO2e emissions.
"""
    return response
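# Worked example of the unit conversion above (for illustration only, not called anywhere):
# if one query emits 0.5 g CO2e, 1 billion queries emit 0.5 * 1e9 g = 5e8 g, and since
# 1 metric tonne = 1e6 g, that is 5e8 / 1e6 = 500 tonnes = 0.5 * 1e3 tonnes, which is
# why the f-string above multiplies co2grams by 1e3 to report metric tonnes.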
@spaces.GPU  # ZeroGPU Spaces only allocate a GPU to functions wrapped in this decorator
def llama32_3b_chat(message: str) -> tuple[str, str]:
    """Runs the query through the pipeline and returns the generated text plus an emissions summary."""
    with EmissionsTracker() as tracker:
        input_history = [{"role": "system", "content": "You are a helpful chatbot assistant. You will answer all queries as best you can."}]
        input_history.append({"role": "user", "content": f"{message}"})
        # TODO: add handling for the model's context window here
        outputs = llama32_3b_pipe(
            input_history,
            max_new_tokens=512,
        )
    co2grams = tracker.final_emissions * 1000  # codecarbon reports kg CO2eq; convert to grams
    return outputs[-1]['generated_text'][-1]['content'], calc_co2e_response(co2grams)
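# A hypothetical usage sketch (commented out so the Space only serves the Gradio UI):
# calling the chat function directly returns the model's reply and the emissions summary.
#
# reply, emissions_text = llama32_3b_chat("What is the meaning of life?")
# print(reply)
# print(emissions_text)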
# Create the Gradio interface
def create_interface():
    with gr.Blocks() as demo:
        with gr.Row():
            text_input = gr.Textbox(label="Query", value="What is the meaning of life?")
        with gr.Row():
            submit_btn = gr.Button("Generate response")
        with gr.Row():
            text_output = gr.Textbox(interactive=False, label="Response", show_label=True)
        with gr.Row():
            query_emissions_output = gr.Textbox(interactive=False, label="Query Emissions", show_label=True)

        submit_btn.click(
            fn=llama32_3b_chat,
            inputs=[text_input],
            outputs=[text_output, query_emissions_output],
        )
        with gr.Accordion("How did we get these numbers?", open=False):
            gr.Markdown("""## How emissions are calculated

This widget currently runs Llama 3.2 3B at 16-bit precision on a T4 instance on Hugging Face, and uses [CodeCarbon](https://github.com/mlco2/codecarbon) to measure the power used by the model during inference. CodeCarbon takes into account the location of the inference endpoint and the average emissions per kWh of its power grid. The first figure returned is reported directly by the CodeCarbon Python package and reflects the CO2e generated by the query.

The CO2e per user per day (the second returned result) is based on the average number of ChatGPT queries per user per day, derived from figures reported by [demandsage](https://www.demandsage.com/chatgpt-statistics/), which work out to approximately 8 queries per day.

The total CO2e per day (the final returned result) is based on the average number of queries sent globally per day, which, according to [demandsage](https://www.demandsage.com/chatgpt-statistics/), is over 1 billion.

**NOTE: commercial LLM deployments are likely to have significantly higher CO2e per query, given their much larger parameter counts and the proportionately more powerful hardware needed to run such models. Rough approximations of their power usage based on smaller open-source models are difficult to make, given the lack of transparency around inference hardware, model architecture, and any code- or kernel-level optimisations made by inference providers.**
""")
    return demo
# Launch the app
demo = create_interface()
demo.launch()