Csplk committed on
Commit
7ab08cb
·
verified ·
1 Parent(s): 347e03a

Update app.py

Files changed (1)
  1. app.py +64 -13
app.py CHANGED
@@ -6,20 +6,60 @@ from threading import Thread
 from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
 from PIL import ImageDraw
 from torchvision.transforms.v2 import Resize
-#import subprocess
 
-#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-#subprocess.run('cp -r moondream/torch clients/python/moondream/torch')
-#subprocess.run('pip install moondream[gpu]')
+from transformers import AutoModelForCausalLM
+
+moondream = AutoModelForCausalLM.from_pretrained(
+    "moondream/moondream3-preview",
+    trust_remote_code=True,
+    dtype=torch.bfloat16,
+    device_map={"": "cuda"},
+)
+moondream.compile()
+
+# Encode image once
+image = Image.open("complex_scene.jpg")
+encoded = moondream.encode_image(image)
+
+# Reuse the encoding for multiple queries
+questions = [
+    "How many people are in this image?",
+    "What time of day was this taken?",
+    "What's the weather like?",
+]
+
+for q in questions:
+    result = moondream.query(image=encoded, question=q, reasoning=False)
+    print(f"Q: {q}")
+    print(f"A: {result['answer']}\n")
+
+# Also works with other skills
+caption = moondream.caption(encoded, length="normal")
+objects = moondream.detect(encoded, "poop")
+points = moondream.point(encoded, "grass")
+print(f"caption: {caption}, objects: {objects}, points: {points}")
+
+# Segment an object
+result = moondream.segment(image, "cat")
+svg_path = result["path"]
+bbox = result["bbox"]
+
+print(f"SVG Path: {svg_path[:100]}...")
+print(f"Bounding box: {bbox}")
+
+# With spatial hint (point) to guide segmentation
+result = moondream.segment(image, "cat", spatial_refs=[[0.5, 0.3]])
+
+# With spatial hint (bounding box)
+result = moondream.segment(image, "cat", spatial_refs=[[0.2, 0.1, 0.8, 0.9]])
+
+"""
 #model_id = "vikhyatk/moondream2"
 #revision = "2025-01-09"
 
 #def load_moondream():
-#    """Load Moondream model and tokenizer."""
-#    model = AutoModelForCausalLM.from_pretrained(
+#    Load Moondream model and tokenizer.
+#    model = AutoModelForCausalLM.from_pretrained(
 #        "vikhyatk/moondream2", trust_remote_code=True, device_map={"": "cuda"}
 #    )
 #    tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
@@ -33,14 +73,13 @@ from torchvision.transforms.v2 import Resize
 
 #moondream.eval()
 
-"""Load Moondream model and tokenizer."""
-moondream = AutoModelForCausalLM.from_pretrained(
-    "vikhyatk/moondream2",
-    revision="2025-01-09",
-    trust_remote_code=True,
-    device_map={"": "cuda"},
+model = AutoModelForCausalLM.from_pretrained(
+    "vikhyatk/moondream2",
+    trust_remote_code=True,
+    dtype=torch.bfloat16,
+    device_map="cuda",  # "cuda" on Nvidia GPUs
 )
-tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
+"""
 
 @spaces.GPU(durtion="150")
 def answer_questions(image_tuples, prompt_text):
@@ -68,6 +107,18 @@ def answer_questions(image_tuples, prompt_text):
     #print("result\n{}\n\nQ_and_A\n{}\n\n".format(result, Q_and_A))
     return Q_and_A, result
 
+"""
+Load Moondream model and tokenizer.
+moondream = AutoModelForCausalLM.from_pretrained(
+    "vikhyatk/moondream2",
+    revision="2025-01-09",
+    trust_remote_code=True,
+    device_map={"": "cuda"},
+)
+tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
+"""
+
+
 with gr.Blocks() as demo:
     gr.Markdown("# moondream2 unofficial batch processing demo")
     gr.Markdown("1. Select images\n2. Enter one or more prompts separated by commas. Ex: Describe this image, What is in this image?\n\n")