Spaces: Running on Zero
Enable 8-bit quantization option
Browse files
- __pycache__/app.cpython-313.pyc +0 -0
- app.py +4 -1
__pycache__/app.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
|
|
|
app.py
CHANGED
|
@@ -32,6 +32,7 @@ MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "600"))
|
|
| 32 |
DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", "0.2"))
|
| 33 |
DEFAULT_TOP_P = float(os.environ.get("DEFAULT_TOP_P", "0.9"))
|
| 34 |
USE_4BIT = os.environ.get("LOAD_IN_4BIT", "1") not in {"0", "false", "False"}
|
|
|
|
| 35 |
|
| 36 |
MODEL_FALLBACKS = [
|
| 37 |
"Alovestocode/router-qwen3-32b-merged",
|
|
@@ -88,7 +89,9 @@ def get_model() -> AutoModelForCausalLM:
|
|
| 88 |
"device_map": "auto",
|
| 89 |
"trust_remote_code": True,
|
| 90 |
}
|
| 91 |
-
if USE_4BIT:
|
|
|
|
|
|
|
| 92 |
kwargs["quantization_config"] = BitsAndBytesConfig(
|
| 93 |
load_in_4bit=True,
|
| 94 |
bnb_4bit_compute_dtype=dtype,
|
|
|
|
| 32 |
DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", "0.2"))
|
| 33 |
DEFAULT_TOP_P = float(os.environ.get("DEFAULT_TOP_P", "0.9"))
|
| 34 |
USE_4BIT = os.environ.get("LOAD_IN_4BIT", "1") not in {"0", "false", "False"}
|
| 35 |
+
USE_8BIT = os.environ.get("LOAD_IN_8BIT", "0").lower() in {"1", "true", "yes"}
|
| 36 |
|
| 37 |
MODEL_FALLBACKS = [
|
| 38 |
"Alovestocode/router-qwen3-32b-merged",
|
|
|
|
| 89 |
"device_map": "auto",
|
| 90 |
"trust_remote_code": True,
|
| 91 |
}
|
| 92 |
+
if USE_8BIT:
|
| 93 |
+
kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
|
| 94 |
+
elif USE_4BIT:
|
| 95 |
kwargs["quantization_config"] = BitsAndBytesConfig(
|
| 96 |
load_in_4bit=True,
|
| 97 |
bnb_4bit_compute_dtype=dtype,
|