"""Gradio app that estimates the memory needed to run a quantized LLM.

Given a Hugging Face model id, a context length and a quantization (either a
named GGUF quant from ``quants`` or a numeric bits-per-weight value), the app
reports an estimated model size, context size and their total in GB.
"""

import json
from html.parser import HTMLParser
from typing import Any, Optional

import gradio as gr
import pandas as pd
import requests

# Bits per weight used for each named GGUF quantization type.
quants = {
    "Q2_K": 3.35,
    "Q3_K_S": 3.5,
    "Q3_K_M": 3.91,
    "Q3_K_L": 4.27,
    "Q4_0": 4.55,
    "Q4_K_S": 4.58,
    "Q4_K_M": 4.85,
    "Q5_0": 5.54,
    "Q5_K_S": 5.54,
    "Q5_K_M": 5.69,
    "Q6_K": 6.59,
    "Q8_0": 8.5,
}

# Seconds to wait on huggingface.co before a request is abandoned, so a
# stalled connection cannot hang the app indefinitely.
REQUEST_TIMEOUT = 30

# Failures that mean "this size source is unavailable, try the next one":
# network/HTTP problems, a non-JSON body, or JSON without the expected keys.
_LOOKUP_ERRORS = (requests.RequestException, ValueError, KeyError, TypeError)


class SvelteHydratorExtractor(HTMLParser):
    """HTML parser that records the ``data-props`` attribute it encounters.

    After ``feed()``, ``data`` holds the attribute text of the last start tag
    that carried a ``data-props`` value, or ``None`` if there was none.
    """

    def __init__(self):
        self.data = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        print("Start tag:", tag)
        for name, value in attrs:
            # A valueless attribute is reported as None by HTMLParser.
            if name == "data-props" and value is not None:
                # HTMLParser already decodes entities in attribute values;
                # the replace only covers a doubly-escaped payload.
                self.data = value.replace("&quot;", '"')


def calc_model_size(parameters: int, quant: float) -> float:
    """Return the size in bytes of ``parameters`` weights at ``quant`` bits each.

    The result is floor-divided by 8 (bits to bytes); it is a float whenever
    either argument is a float.
    """
    return parameters * quant // 8


def _fetch_index_total_size(hf_model: str, index_file: str) -> int:
    """Return ``metadata.total_size`` from a checkpoint index file of the repo."""
    response = requests.get(
        f"https://huggingface.co/{hf_model}/raw/main/{index_file}",
        timeout=REQUEST_TIMEOUT,
    )
    return response.json()["metadata"]["total_size"]


def _extract_data_props(model_page: str, target: str) -> Optional[dict[str, Any]]:
    """Return the parsed ``data-props`` JSON of the element marked ``target``.

    Returns ``None`` when the page has no ``data-target="<target>"`` marker or
    the surrounding tag cannot be delimited.
    """
    marker_idx = model_page.find(f'data-target="{target}"')
    if marker_idx == -1:
        return None
    # NOTE(review): assumes the marker sits inside a <div ...> opening tag and
    # that the first ">" after the marker closes that tag - confirm against the
    # live model page markup.
    tag_start = model_page.rfind("<div", 0, marker_idx)
    tag_end = model_page.find(">", marker_idx)
    if tag_start == -1 or tag_end == -1:
        return None
    extractor = SvelteHydratorExtractor()
    extractor.feed(model_page[tag_start : tag_end + 1])
    if extractor.data is None:
        return None
    return json.loads(extractor.data)


def _scrape_model_size(hf_model: str) -> int:
    """Derive the model size from the parameter count embedded in the model page.

    Raises:
        ValueError: if neither known page component is present.
        KeyError: if the component has no safetensors parameter total.
    """
    model_page = requests.get(
        f"https://huggingface.co/{hf_model}", timeout=REQUEST_TIMEOUT
    ).text
    props = _extract_data_props(model_page, "ModelSafetensorsParams")
    if props is not None:
        total_params = props["safetensors"]["total"]
    else:
        props = _extract_data_props(model_page, "ModelHeader")
        if props is None:
            raise ValueError(
                f"Could not find parameter information on the page of {hf_model!r}"
            )
        total_params = props["model"]["safetensors"]["total"]
    # Two bytes per parameter, matching the fp16 assumption of the caller.
    return total_params * 2


def get_model_config(hf_model: str) -> dict[str, Any]:
    """Fetch ``config.json`` for ``hf_model`` and add a ``"parameters"`` entry.

    The model size is taken from the first source that works:
    ``model.safetensors.index.json``, then ``pytorch_model.bin.index.json``,
    then the parameter total scraped from the model page.

    Args:
        hf_model: Hugging Face repository id, e.g. ``mistralai/Mistral-7B-v0.1``.

    Returns:
        The parsed config with ``config["parameters"]`` set to half the model
        size, i.e. the parameter count under the fp16 assumption.

    Raises:
        requests.RequestException: if ``config.json`` or the model page cannot
            be fetched.
        ValueError: if ``config.json`` is not JSON or no size source is usable.
        KeyError: if the model page lacks a safetensors parameter total.
    """
    config = requests.get(
        f"https://huggingface.co/{hf_model}/raw/main/config.json",
        timeout=REQUEST_TIMEOUT,
    ).json()

    model_size = None
    for index_file in (
        "model.safetensors.index.json",
        "pytorch_model.bin.index.json",
    ):
        try:
            model_size = _fetch_index_total_size(hf_model, index_file)
        except _LOOKUP_ERRORS:
            continue
        break
    if model_size is None:
        model_size = _scrape_model_size(hf_model)

    # assume fp16 weights
    config["parameters"] = model_size / 2
    return config


def calc_input_buffer_size(model_config, context: int) -> float:
    """Return the input buffer estimate for ``context`` tokens.

    Reads ``model_config["hidden_size"]``. NOTE(review): the result is summed
    with byte counts in ``calc``, so it is presumably in bytes - confirm.
    """
    return 4096 + 2048 * model_config["hidden_size"] + context * 4 + context * 2048


def calc_compute_buffer_size(model_config, context: int) -> float:
    """Return the compute buffer estimate for ``context`` tokens.

    Scales with ``model_config["num_attention_heads"]`` and grows linearly
    with the context length.
    """
    return (
        (context / 1024 * 2 + 0.75)
        * model_config["num_attention_heads"]
        * 1024
        * 1024
    )


def calc_context_size(model_config, context: int) -> float:
    """Return the context (cache) size estimate for ``context`` tokens.

    Reads ``num_attention_heads``, ``hidden_size`` and ``num_hidden_layers``.
    When ``num_key_value_heads`` is absent from the config it is taken to equal
    ``num_attention_heads`` instead of raising ``KeyError``.
    """
    n_heads = model_config["num_attention_heads"]
    n_kv_heads = model_config.get("num_key_value_heads", n_heads)
    n_gqa = n_heads / n_kv_heads
    n_embd_gqa = model_config["hidden_size"] / n_gqa
    n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
    # NOTE(review): presumably two caches (K and V) at 2 bytes per element -
    # confirm.
    return 2 * n_elements * 2


def calc(model_base, context, quant_size):
    """Estimate memory use for ``model_base`` at the given context and quant.

    Args:
        model_base: Hugging Face repository id of the unquantized model.
        context: desired context length in tokens.
        quant_size: a key of ``quants`` or a numeric bits-per-weight value
            (number or numeric string).

    Returns:
        ``(model_size, context_size, total)`` in GB (10**9 bytes), each rounded
        to two decimals.

    Raises:
        KeyError: if ``quant_size`` is neither numeric nor a key of ``quants``.
    """
    model_config = get_model_config(model_base)

    try:
        quant_bpw = float(quant_size)
    except (TypeError, ValueError):
        # Not a number: treat it as a named GGUF quant.
        quant_bpw = quants[quant_size]

    model_size = round(
        calc_model_size(model_config["parameters"], quant_bpw) / 1000 / 1000 / 1000,
        2,
    )
    context_size = round(
        (
            calc_input_buffer_size(model_config, context)
            + calc_context_size(model_config, context)
            + calc_compute_buffer_size(model_config, context)
        )
        / 1000
        / 1000
        / 1000,
        2,
    )
    return model_size, context_size, round(model_size + context_size, 2)


title = "GGUF VRAM Calculator"

with gr.Blocks(title=title, theme=gr.themes.Monochrome()) as app:
    default_model = "mistralai/Mistral-7B-v0.1"
    default_quant = "Q4_K_S"
    default_context = 8192
    # Computed once at startup so the result fields are pre-filled; this
    # performs network requests.
    default_size = calc(default_model, default_context, default_quant)
    default_model_size = default_size[0]
    default_context_size = default_size[1]

    gr.Markdown(
        f"# {app.title}\n"
        "## This space has been superseeded by the "
        "[NyxKrage/LLM-Model-VRAM-Calculator]"
        "(https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator), "
        "which has model search built in, and doesn't rely on gradio\n"
        "This is meant only as a guide and is will not be 100% accurate, "
        "this also does not account for anything that might be running in the "
        "background on your system or CUDA system memory fallback on Windows"
    )
    model = gr.Textbox(
        value=default_model,
        label="Enter Unquantized HF Model Name (e.g. mistralai/Mistral-7B-v0.1)",
    )
    context = gr.Number(
        minimum=1, value=default_context, label="Desired Context Size (Tokens)"
    )
    quant = gr.Dropdown(
        choices=list(quants.keys()),
        value=default_quant,
        allow_custom_value=True,
        label="Enter GGUF Quant (e.g. Q4_K_S) or the specific BPW for other quantization schemes such as exl2 (e.g. 4.5)",
    )
    btn = gr.Button(value="Submit", variant="primary")
    btn.click(
        calc,
        inputs=[
            model,
            context,
            quant,
        ],
        outputs=[
            gr.Number(
                label="Model Size (GB)",
                value=default_model_size,
            ),
            gr.Number(
                label="Context Size (GB)",
                value=default_context_size,
            ),
            gr.Number(
                label="Total Size (GB)",
                value=default_size[2],
            ),
        ],
    )

app.launch()