WilhelmT committed
Commit d91d12e · verified · 1 Parent(s): 23e9529

Update README.md

Files changed (1)
  1. README.md +4 -2
README.md CHANGED
@@ -79,6 +79,7 @@ The `embedl-models` package is required, it provides the optimized FlashHead imp
 ---
 
 ## Usage Examples
+**Note (vLLM context length):** `max_model_len=131072` may fail on GPUs without enough free VRAM for the KV cache. If you see a KV cache memory error, lower `max_model_len` (or increase `gpu_memory_utilization`).
 
 ### vLLM Inference
 
@@ -90,7 +91,7 @@ model_id = "embedl/Llama-3.2-1B-Instruct-FlashHead"
 
 if __name__ == "__main__":
     sampling = SamplingParams(max_tokens=128, temperature=0.0)
-    llm = LLM(model=model_id, trust_remote_code=True)
+    llm = LLM(model=model_id, trust_remote_code=True, max_model_len=131072)
 
     prompt = "Write a haiku about coffee."
     output = llm.generate([prompt], sampling)
@@ -113,7 +114,8 @@ model_id = "embedl/Llama-3.2-1B-Instruct-FlashHead"
 if __name__ == "__main__":
     asyncio.run(
         run_repl(
-            model=model_id
+            model=model_id,
+            max_model_len=131072
         )
     )
 ```
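
For reference, a minimal sketch of the workaround the added note describes, using the public vLLM `LLM` constructor. The smaller context length (32768) and `gpu_memory_utilization=0.95` are illustrative values chosen for this sketch, not part of the commit; pick whatever your GPU's free VRAM allows.

```python
from vllm import LLM, SamplingParams

model_id = "embedl/Llama-3.2-1B-Instruct-FlashHead"

if __name__ == "__main__":
    # If the README's default max_model_len=131072 fails with a KV cache
    # memory error, reduce max_model_len and/or raise gpu_memory_utilization.
    llm = LLM(
        model=model_id,
        trust_remote_code=True,
        max_model_len=32768,          # assumed smaller context for this sketch
        gpu_memory_utilization=0.95,  # vLLM's default is 0.9
    )
    sampling = SamplingParams(max_tokens=128, temperature=0.0)
    out = llm.generate(["Write a haiku about coffee."], sampling)
    print(out[0].outputs[0].text)
```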