Alovestocode committed on
Commit
a6b24a3
·
verified ·
1 Parent(s): 5bc2959

Enable 8-bit quantization option

Browse files
Files changed (2) hide show
  1. __pycache__/app.cpython-313.pyc +0 -0
  2. app.py +4 -1
__pycache__/app.cpython-313.pyc CHANGED
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
 
app.py CHANGED
@@ -32,6 +32,7 @@ MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "600"))
32
  DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", "0.2"))
33
  DEFAULT_TOP_P = float(os.environ.get("DEFAULT_TOP_P", "0.9"))
34
  USE_4BIT = os.environ.get("LOAD_IN_4BIT", "1") not in {"0", "false", "False"}
 
35
 
36
  MODEL_FALLBACKS = [
37
  "Alovestocode/router-qwen3-32b-merged",
@@ -88,7 +89,9 @@ def get_model() -> AutoModelForCausalLM:
88
  "device_map": "auto",
89
  "trust_remote_code": True,
90
  }
91
- if USE_4BIT:
 
 
92
  kwargs["quantization_config"] = BitsAndBytesConfig(
93
  load_in_4bit=True,
94
  bnb_4bit_compute_dtype=dtype,
 
32
  DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", "0.2"))
33
  DEFAULT_TOP_P = float(os.environ.get("DEFAULT_TOP_P", "0.9"))
34
  USE_4BIT = os.environ.get("LOAD_IN_4BIT", "1") not in {"0", "false", "False"}
35
+ USE_8BIT = os.environ.get("LOAD_IN_8BIT", "0").lower() in {"1", "true", "yes"}
36
 
37
  MODEL_FALLBACKS = [
38
  "Alovestocode/router-qwen3-32b-merged",
 
89
  "device_map": "auto",
90
  "trust_remote_code": True,
91
  }
92
+ if USE_8BIT:
93
+ kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
94
+ elif USE_4BIT:
95
  kwargs["quantization_config"] = BitsAndBytesConfig(
96
  load_in_4bit=True,
97
  bnb_4bit_compute_dtype=dtype,