gpt-oss-120b-chat

Running

App Files Files Community

gpt-oss-120b-chat / app.py

bradnow

Showing thoughts separately and remove thoughts when calling inference API

9e185d2 verified 7 months ago

raw

history blame

3.23 kB

	import os
	import gradio as gr
	from openai import OpenAI

	title = None # "ServiceNow-AI Chat" # modelConfig.get('MODE_DISPLAY_NAME')
	description = None

	model_config = {
	"MODEL_NAME": os.environ.get("MODEL_NAME"),
	"MODE_DISPLAY_NAME": os.environ.get("MODE_DISPLAY_NAME"),
	"MODEL_HF_URL": os.environ.get("MODEL_HF_URL"),
	"VLLM_API_URL": os.environ.get("VLLM_API_URL"),
	"AUTH_TOKEN": os.environ.get("AUTH_TOKEN")
	}

	# Initialize the OpenAI client with the vLLM API URL and token
	client = OpenAI(
	api_key=model_config.get('AUTH_TOKEN'),
	base_url=model_config.get('VLLM_API_URL')
	)


	def chat_fn(message, history):
	# Remove any assistant messages with metadata from history
	print(f"Original History: {history}")
	history = [item for item in history if
	not (isinstance(item, dict) and
	item.get("role") == "assistant" and
	isinstance(item.get("metadata"), dict) and
	item.get("metadata", {}).get("title") is not None)]
	print(f"Updated History: {history}")

	messages = history + [{"role": "user", "content": message}]
	print(f"Messages: {messages}")

	# Create the streaming response
	stream = client.chat.completions.create(
	model=model_config.get('MODEL_NAME'),
	messages=messages,
	temperature=0.8,
	stream=True
	)

	history.append(gr.ChatMessage(
	role="assistant",
	content="Thinking...",
	metadata={"title": "🧠 Thought"}
	))

	output = ""
	completion_started = False
	for chunk in stream:
	# Extract the new content from the delta field
	content = getattr(chunk.choices[0].delta, "content", "")
	output += content

	parts = output.split("[BEGIN FINAL RESPONSE]")

	if len(parts) > 1:
	if parts[1].endswith("[END FINAL RESPONSE]"):
	parts[1] = parts[1].replace("[END FINAL RESPONSE]", "")
	if parts[1].endswith("[END FINAL RESPONSE]\n<\|end\|>"):
	parts[1] = parts[1].replace("[END FINAL RESPONSE]\n<\|end\|>", "")

	history[-1 if not completion_started else -2] = gr.ChatMessage(
	role="assistant",
	content=parts[0],
	metadata={"title": "🧠 Thought"}
	)
	if completion_started:
	history[-1] = gr.ChatMessage(
	role="assistant",
	content=parts[1]
	)
	elif len(parts) > 1 and not completion_started:
	completion_started = True
	history.append(gr.ChatMessage(
	role="assistant",
	content=parts[1]
	))

	# only yield the most recent assistant messages
	messages_to_yield = history[-1:] if not completion_started else history[-2:]
	yield messages_to_yield


	# Add the model display name and Hugging Face URL to the description
	# description = f"### Model: [{MODE_DISPLAY_NAME}]({MODEL_HF_URL})"

	print(f"Running model {model_config.get('MODE_DISPLAY_NAME')} ({model_config.get('MODEL_NAME')})")

	gr.ChatInterface(
	chat_fn,
	title=title,
	description=description,
	theme=gr.themes.Default(primary_hue="green"),
	type="messages",
	).launch()