fuvty committed
Commit 68b944b
1 Parent(s): 7b0d224

[update] support zeroGPU

Files changed (2):
  1. app.py +52 -9
  2. requirements.txt +3 -0
app.py CHANGED
@@ -6,7 +6,10 @@ This creates a web interface to compare three inference modes simultaneously:
 2. T2T: Two-stage inference (shows context + answer)
 3. C2C: Rosetta model with projectors
 
-All models are loaded at startup and respond to the same input in parallel.
+ZeroGPU Support:
+- Models are loaded to CPU at startup
+- @spaces.GPU decorator moves models to GPU on-demand for each inference
+- Works seamlessly on both ZeroGPU and regular GPU environments
 """
 
 import os
@@ -19,6 +22,20 @@ from typing import Optional, Generator
 from queue import Queue
 from threading import Thread
 
+# ZeroGPU support
+try:
+    import spaces
+    ZEROGPU_AVAILABLE = True
+except ImportError:
+    ZEROGPU_AVAILABLE = False
+    # Create a no-op decorator for non-ZeroGPU environments
+    class spaces:
+        @staticmethod
+        def GPU(duration=None):
+            def decorator(func):
+                return func
+            return decorator if duration else lambda f: f
+
 from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 from rosetta.utils.evaluate import load_rosetta_model, load_hf_model, set_default_chat_template
 from rosetta.model.wrapper import RosettaModel
@@ -46,8 +63,13 @@ class ModelManager:
             c2c_checkpoint_path: Path to C2C checkpoint directory
             device: Device to use (cuda, cpu, or auto)
         """
+        # For ZeroGPU, load models to CPU and move to GPU in decorated functions
         if device == "auto":
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            if ZEROGPU_AVAILABLE:
+                self.device = torch.device("cpu")
+                print("ZeroGPU detected: Loading models to CPU (will move to GPU on-demand)")
+            else:
+                self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         else:
             self.device = torch.device(device)
         print(f"Using device: {self.device}")
@@ -208,13 +230,19 @@ class ModelManager:
 
         return kwargs
 
+    @spaces.GPU(duration=60)
     def generate_single(self, user_input: str) -> Generator[str, None, None]:
         """Generate response from single model with streaming."""
+        # Move model to GPU for ZeroGPU
+        device = torch.device("cuda" if ZEROGPU_AVAILABLE else self.device)
+        if ZEROGPU_AVAILABLE and self.single_model.device.type != "cuda":
+            self.single_model.to(device)
+
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
         text = self.single_tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
         )
-        inputs = self.single_tokenizer(text, return_tensors="pt").to(self.device)
+        inputs = self.single_tokenizer(text, return_tensors="pt").to(device)
 
         # Setup streamer
         streamer = TextIteratorStreamer(
@@ -241,8 +269,17 @@ class ModelManager:
             generated_text += token
             yield generated_text
 
+    @spaces.GPU(duration=90)
     def generate_t2t(self, user_input: str) -> Generator[tuple[str, str], None, None]:
         """Generate response from T2T model with streaming (returns context, answer)."""
+        # Move models to GPU for ZeroGPU
+        device = torch.device("cuda" if ZEROGPU_AVAILABLE else self.device)
+        if ZEROGPU_AVAILABLE:
+            if self.t2t_model.context_model.device.type != "cuda":
+                self.t2t_model.context_model.to(device)
+            if self.t2t_model.answer_model.device.type != "cuda":
+                self.t2t_model.answer_model.to(device)
+
         # Stage 1: Context generation
         context_streamer = TextIteratorStreamer(
             self.t2t_model.context_tokenizer,
@@ -257,7 +294,7 @@ class ModelManager:
             add_generation_prompt=True,
             return_tensors="pt",
             enable_thinking=False
-        ).to(self.device)
+        ).to(device)
 
         generation_kwargs = {
             'input_ids': inputs,
@@ -306,7 +343,7 @@ class ModelManager:
             add_generation_prompt=True,
             return_tensors="pt",
             enable_thinking=False
-        ).to(self.device)
+        ).to(device)
 
         generation_kwargs = {
             'input_ids': inputs,
@@ -324,13 +361,19 @@ class ModelManager:
             answer_text += token
             yield context_text, answer_text
 
+    @spaces.GPU(duration=60)
     def generate_c2c(self, user_input: str) -> Generator[str, None, None]:
        """Generate response from C2C model with streaming."""
+        # Move model to GPU for ZeroGPU
+        device = torch.device("cuda" if ZEROGPU_AVAILABLE else self.device)
+        if ZEROGPU_AVAILABLE and self.c2c_model.device.type != "cuda":
+            self.c2c_model.to(device)
+
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
         text = self.c2c_tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
         )
-        inputs = self.c2c_tokenizer(text, return_tensors="pt").to(self.device)
+        inputs = self.c2c_tokenizer(text, return_tensors="pt").to(device)
 
         # Setup streamer
         streamer = TextIteratorStreamer(
@@ -343,12 +386,12 @@ class ModelManager:
         full_length = inputs.input_ids.shape[1]
         instruction_index = torch.tensor([1, 0], dtype=torch.long).repeat(
             full_length - 1, 1
-        ).unsqueeze(0).to(self.device)
+        ).unsqueeze(0).to(device)
         label_index = torch.tensor([-1, 0], dtype=torch.long).repeat(
             1, 1
-        ).unsqueeze(0).to(self.device)
+        ).unsqueeze(0).to(device)
         position_ids = inputs.attention_mask.long().cumsum(-1) - 1 if inputs.attention_mask is not None else \
-            torch.arange(full_length, dtype=torch.long).unsqueeze(0).to(self.device)
+            torch.arange(full_length, dtype=torch.long).unsqueeze(0).to(device)
 
         # Generation parameters
         generation_kwargs = {
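
The core of the change is a load-on-CPU / attach-GPU-per-call pattern: models stay on CPU at startup, and each generate_* method is wrapped in @spaces.GPU so a GPU is requested only for the duration of that call. A minimal, self-contained sketch of the same pattern (the nn.Linear stand-in, run(), and duration value are illustrative, not part of the repository):

# Minimal sketch of the ZeroGPU pattern above; the Linear model and run() are
# stand-ins for the repo's Rosetta/Qwen models.
import torch
import torch.nn as nn

try:
    import spaces                              # present on HuggingFace Spaces
    ZEROGPU_AVAILABLE = True
except ImportError:                            # local run: use a no-op decorator
    ZEROGPU_AVAILABLE = False

    class spaces:
        @staticmethod
        def GPU(duration=None):
            def decorator(func):
                return func
            return decorator

model = nn.Linear(8, 8)                        # loaded on CPU at startup

@spaces.GPU(duration=60)                       # on ZeroGPU, a GPU is attached only for this call
def run(x: torch.Tensor) -> torch.Tensor:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)                           # move weights on demand
    return model(x.to(device)).cpu()           # hand results back on CPU

print(run(torch.randn(1, 8)).shape)            # torch.Size([1, 8])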
requirements.txt CHANGED
@@ -8,6 +8,9 @@ gradio==5.9.1
 # HuggingFace Hub for checkpoint downloads
 huggingface-hub>=0.26.0
 
+# ZeroGPU support for HuggingFace Spaces
+spaces>=0.30.0
+
 # Configuration file parsing
 pyyaml>=6.0
 
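
The new spaces pin only matters on HuggingFace Spaces; locally, the try/except fallback added to app.py keeps the demo importable without it. A quick check of which path is active (sketch only; the fallback behaviour comes from the changes above):

try:
    import spaces  # noqa: F401
    print("spaces installed: @spaces.GPU will request a GPU per call")
except ImportError:
    print("spaces not installed: app.py falls back to its no-op GPU decorator")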