- app.py +7 -4
- documentation_gemini/audio_understanding.md +158 -0
- documentation_gemini/code_execution.md +258 -0
- documentation_gemini/document_understanding.md +215 -0
- documentation_gemini/function_calling_with_the_gemini_api.md +674 -0
- documentation_gemini/gemini_thinking.md +260 -0
- documentation_gemini/grounding_with_google_search.md +215 -0
- documentation_gemini/image_generation_with_gemini.md +137 -0
- documentation_gemini/image_understanding.md +352 -0
- documentation_gemini/long_context.md +116 -0
- documentation_gemini/speech_generation_text-to-speech.md +256 -0
- documentation_gemini/structured_output.md +392 -0
- documentation_gemini/text_generation.md +199 -0
- documentation_gemini/url_context.md +198 -0
- documentation_gemini/video_understanding.md +224 -0
- templates/index.html +48 -15
app.py
CHANGED
@@ -39,7 +39,7 @@ DEFAULT_TOOLS = [
| 39 | conversations = {}
| 40 | conversation_metadata = {}
| 41 |
| 42 | - def add_message_to_history(conversation_id, role, content, has_file=False):
| 43 |     """Ajoute un message à l'historique de la conversation"""
| 44 |     if conversation_id not in conversation_metadata:
| 45 |         conversation_metadata[conversation_id] = {
@@ -50,12 +50,15 @@ def add_message_to_history(conversation_id, role, content, has_file=False):
| 50 |             'status': 'active'
| 51 |         }
| 52 |
| 53 | -
| 54 |         'role': role,
| 55 |         'content': content,
| 56 |         'timestamp': datetime.now().isoformat(),
| 57 |         'hasFile': has_file
| 58 | -   }
| 59 |     conversation_metadata[conversation_id]['last_activity'] = datetime.now().isoformat()
| 60 |
| 61 | @app.route('/')
@@ -213,7 +216,7 @@ def chat_with_file():
| 213 |     display_message = message if message else 'Analyse ce fichier'
| 214 |     if file_data:
| 215 |         display_message += f" [Fichier: {file_data.get('filename', 'inconnu')}]"
| 216 | -   add_message_to_history(conversation_id, 'user', display_message, has_file=True)
| 217 |
| 218 |     # Configuration du thinking
| 219 |     config_dict = DEFAULT_CONFIG.copy()

| 39 | conversations = {}
| 40 | conversation_metadata = {}
| 41 |
| 42 | + def add_message_to_history(conversation_id, role, content, has_file=False, file_data=None):
| 43 |     """Ajoute un message à l'historique de la conversation"""
| 44 |     if conversation_id not in conversation_metadata:
| 45 |         conversation_metadata[conversation_id] = {

| 50 |             'status': 'active'
| 51 |         }
| 52 |
| 53 | +   message_data = {
| 54 |         'role': role,
| 55 |         'content': content,
| 56 |         'timestamp': datetime.now().isoformat(),
| 57 |         'hasFile': has_file
| 58 | +   }
| 59 | +   if file_data:
| 60 | +       message_data['fileData'] = file_data
| 61 | +   conversation_metadata[conversation_id]['messages'].append(message_data)
| 62 |     conversation_metadata[conversation_id]['last_activity'] = datetime.now().isoformat()
| 63 |
| 64 | @app.route('/')

| 216 |     display_message = message if message else 'Analyse ce fichier'
| 217 |     if file_data:
| 218 |         display_message += f" [Fichier: {file_data.get('filename', 'inconnu')}]"
| 219 | +   add_message_to_history(conversation_id, 'user', display_message, has_file=True, file_data=file_data)
| 220 |
| 221 |     # Configuration du thinking
| 222 |     config_dict = DEFAULT_CONFIG.copy()
documentation_gemini/audio_understanding.md
ADDED
|
@@ -0,0 +1,158 @@
|
| 1 |
+
# Audio understanding
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/audio>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
Gemini can analyze and understand audio input, enabling use cases like the following:
|
| 8 |
+
|
| 9 |
+
* Describe, summarize, or answer questions about audio content.
|
| 10 |
+
* Provide a transcription of the audio.
|
| 11 |
+
* Analyze specific segments of the audio.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
This guide shows you how to use the Gemini API to generate a text response to audio input.
|
| 16 |
+
|
| 17 |
+
### Before you begin
|
| 18 |
+
|
| 19 |
+
Before calling the Gemini API, ensure you have [your SDK of choice](/gemini-api/docs/downloads) installed, and a [Gemini API key](/gemini-api/docs/api-key) configured and ready to use.
|
| 20 |
+
|
| 21 |
+
## Input audio
|
| 22 |
+
|
| 23 |
+
You can provide audio data to Gemini in the following ways:
|
| 24 |
+
|
| 25 |
+
* Upload an audio file before making a request to `generateContent`.
|
| 26 |
+
* Pass inline audio data with the request to `generateContent`.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
### Upload an audio file
|
| 31 |
+
|
| 32 |
+
You can use the [Files API](/gemini-api/docs/files) to upload an audio file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB.
|
| 33 |
+
|
| 34 |
+
The following code uploads an audio file and then uses the file in a call to `generateContent`.
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
from google import genai
|
| 38 |
+
|
| 39 |
+
client = genai.Client()
|
| 40 |
+
|
| 41 |
+
myfile = client.files.upload(file="path/to/sample.mp3")
|
| 42 |
+
|
| 43 |
+
response = client.models.generate_content(
|
| 44 |
+
model="gemini-2.5-flash", contents=["Describe this audio clip", myfile]
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
print(response.text)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
To learn more about working with media files, see [Files API](/gemini-api/docs/files).
|
| 51 |
+
|
| 52 |
+
### Pass audio data inline
|
| 53 |
+
|
| 54 |
+
Instead of uploading an audio file, you can pass inline audio data in the request to `generateContent`:
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
from google.genai import types
|
| 58 |
+
|
| 59 |
+
with open('path/to/small-sample.mp3', 'rb') as f:
|
| 60 |
+
audio_bytes = f.read()
|
| 61 |
+
|
| 62 |
+
response = client.models.generate_content(
|
| 63 |
+
model='gemini-2.5-flash',
|
| 64 |
+
contents=[
|
| 65 |
+
'Describe this audio clip',
|
| 66 |
+
types.Part.from_bytes(
|
| 67 |
+
data=audio_bytes,
|
| 68 |
+
mime_type='audio/mp3',
|
| 69 |
+
)
|
| 70 |
+
]
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
print(response.text)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
A few things to keep in mind about inline audio data:
|
| 77 |
+
|
| 78 |
+
* The maximum request size is 20 MB, which includes text prompts, system instructions, and files provided inline. If your file's size will make the _total request size_ exceed 20 MB, then use the Files API to upload an audio file for use in the request.
|
| 79 |
+
* If you're using an audio sample multiple times, it's more efficient to upload an audio file.
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
## Get a transcript
|
| 84 |
+
|
| 85 |
+
To get a transcript of audio data, just ask for it in the prompt:
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
myfile = client.files.upload(file='path/to/sample.mp3')
|
| 89 |
+
prompt = 'Generate a transcript of the speech.'
|
| 90 |
+
|
| 91 |
+
response = client.models.generate_content(
|
| 92 |
+
model='gemini-2.5-flash',
|
| 93 |
+
contents=[prompt, myfile]
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
print(response.text)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
## Refer to timestamps
|
| 100 |
+
|
| 101 |
+
You can refer to specific sections of an audio file using timestamps of the form `MM:SS`. For example, the following prompt requests a transcript that
|
| 102 |
+
|
| 103 |
+
* Starts at 2 minutes 30 seconds from the beginning of the file.
|
| 104 |
+
* Ends at 3 minutes 29 seconds from the beginning of the file.
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
# Create a prompt containing timestamps.
|
| 111 |
+
prompt = "Provide a transcript of the speech from 02:30 to 03:29."
|
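
A minimal sketch of passing this timestamped prompt to the model, assuming `client` and the uploaded `myfile` from the earlier examples:

```python
# Request a transcript limited to the 02:30 to 03:29 window of the uploaded clip.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[prompt, myfile],
)
print(response.text)
```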
| 112 |
+
|
| 113 |
+
|
| 114 |
+
## Count tokens
|
| 115 |
+
|
| 116 |
+
Call the `countTokens` method to get a count of the number of tokens in an audio file. For example:
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
response = client.models.count_tokens(
|
| 120 |
+
model='gemini-2.5-flash',
|
| 121 |
+
contents=[myfile]
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
print(response)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
## Supported audio formats
|
| 128 |
+
|
| 129 |
+
Gemini supports the following audio format MIME types:
|
| 130 |
+
|
| 131 |
+
* WAV - `audio/wav`
|
| 132 |
+
* MP3 - `audio/mp3`
|
| 133 |
+
* AIFF - `audio/aiff`
|
| 134 |
+
* AAC - `audio/aac`
|
| 135 |
+
* OGG Vorbis - `audio/ogg`
|
| 136 |
+
* FLAC - `audio/flac`
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Technical details about audio
|
| 141 |
+
|
| 142 |
+
* Gemini represents each second of audio as 32 tokens; for example, one minute of audio is represented as 1,920 tokens (see the quick estimate after this list).
|
| 143 |
+
* Gemini can "understand" non-speech components, such as birdsong or sirens.
|
| 144 |
+
* The maximum supported length of audio data in a single prompt is 9.5 hours. Gemini doesn't limit the _number_ of audio files in a single prompt; however, the total combined length of all audio files in a single prompt can't exceed 9.5 hours.
|
| 145 |
+
* Gemini downsamples audio files to a 16 Kbps data resolution.
|
| 146 |
+
* If the audio source contains multiple channels, Gemini combines those channels into a single channel.
|
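
A quick sanity check of those token figures (a rough estimate only; actual counts come from `countTokens`):

```python
TOKENS_PER_SECOND = 32  # per the figure above

def estimated_audio_tokens(duration_seconds: float) -> int:
    """Rough token estimate for an audio clip of the given length."""
    return int(duration_seconds * TOKENS_PER_SECOND)

print(estimated_audio_tokens(60))          # 1920 tokens for one minute
print(estimated_audio_tokens(9.5 * 3600))  # 1094400 tokens for the 9.5-hour maximum
```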
| 147 |
+
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
## What's next
|
| 151 |
+
|
| 152 |
+
This guide shows how to generate text in response to audio data. To learn more, see the following resources:
|
| 153 |
+
|
| 154 |
+
* [File prompting strategies](/gemini-api/docs/files#prompt-guide): The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting.
|
| 155 |
+
* [System instructions](/gemini-api/docs/text-generation#system-instructions): System instructions let you steer the behavior of the model based on your specific needs and use cases.
|
| 156 |
+
* [Safety guidance](/gemini-api/docs/safety-guidance): Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs.
|
| 157 |
+
|
| 158 |
+
|
documentation_gemini/code_execution.md
ADDED
|
@@ -0,0 +1,258 @@
|
| 1 |
+
# Code execution
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/code-execution>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
The Gemini API provides a code execution tool that enables the model to generate and run Python code. The model can then learn iteratively from the code execution results until it arrives at a final output. You can use code execution to build applications that benefit from code-based reasoning. For example, you can use code execution to solve equations or process text. You can also use the libraries included in the code execution environment to perform more specialized tasks.
|
| 8 |
+
|
| 9 |
+
Gemini is only able to execute code in Python. You can still ask Gemini to generate code in another language, but the model can't use the code execution tool to run it.
|
| 10 |
+
|
| 11 |
+
## Enable code execution
|
| 12 |
+
|
| 13 |
+
To enable code execution, configure the code execution tool on the model. This allows the model to generate and run code.
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
from google import genai
|
| 17 |
+
from google.genai import types
|
| 18 |
+
|
| 19 |
+
client = genai.Client()
|
| 20 |
+
|
| 21 |
+
response = client.models.generate_content(
|
| 22 |
+
model="gemini-2.5-flash",
|
| 23 |
+
contents="What is the sum of the first 50 prime numbers? "
|
| 24 |
+
"Generate and run code for the calculation, and make sure you get all 50.",
|
| 25 |
+
config=types.GenerateContentConfig(
|
| 26 |
+
tools=[types.Tool(code_execution=types.ToolCodeExecution)]
|
| 27 |
+
),
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
for part in response.candidates[0].content.parts:
|
| 31 |
+
if part.text is not None:
|
| 32 |
+
print(part.text)
|
| 33 |
+
if part.executable_code is not None:
|
| 34 |
+
print(part.executable_code.code)
|
| 35 |
+
if part.code_execution_result is not None:
|
| 36 |
+
print(part.code_execution_result.output)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
The output might look something like the following, which has been formatted for readability:
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
Okay, I need to calculate the sum of the first 50 prime numbers. Here's how I'll
|
| 43 |
+
approach this:
|
| 44 |
+
|
| 45 |
+
1. **Generate Prime Numbers:** I'll use an iterative method to find prime
|
| 46 |
+
numbers. I'll start with 2 and check if each subsequent number is divisible
|
| 47 |
+
by any number between 2 and its square root. If not, it's a prime.
|
| 48 |
+
2. **Store Primes:** I'll store the prime numbers in a list until I have 50 of
|
| 49 |
+
them.
|
| 50 |
+
3. **Calculate the Sum:** Finally, I'll sum the prime numbers in the list.
|
| 51 |
+
|
| 52 |
+
Here's the Python code to do this:
|
| 53 |
+
|
| 54 |
+
def is_prime(n):
|
| 55 |
+
"""Efficiently checks if a number is prime."""
|
| 56 |
+
if n <= 1:
|
| 57 |
+
return False
|
| 58 |
+
if n <= 3:
|
| 59 |
+
return True
|
| 60 |
+
if n % 2 == 0 or n % 3 == 0:
|
| 61 |
+
return False
|
| 62 |
+
i = 5
|
| 63 |
+
while i * i <= n:
|
| 64 |
+
if n % i == 0 or n % (i + 2) == 0:
|
| 65 |
+
return False
|
| 66 |
+
i += 6
|
| 67 |
+
return True
|
| 68 |
+
|
| 69 |
+
primes = []
|
| 70 |
+
num = 2
|
| 71 |
+
while len(primes) < 50:
|
| 72 |
+
if is_prime(num):
|
| 73 |
+
primes.append(num)
|
| 74 |
+
num += 1
|
| 75 |
+
|
| 76 |
+
sum_of_primes = sum(primes)
|
| 77 |
+
print(f'{primes=}')
|
| 78 |
+
print(f'{sum_of_primes=}')
|
| 79 |
+
|
| 80 |
+
primes=[2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67,
|
| 81 |
+
71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151,
|
| 82 |
+
157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229]
|
| 83 |
+
sum_of_primes=5117
|
| 84 |
+
|
| 85 |
+
The sum of the first 50 prime numbers is 5117.
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
This output combines several content parts that the model returns when using code execution:
|
| 89 |
+
|
| 90 |
+
* `text`: Inline text generated by the model
|
| 91 |
+
* `executableCode`: Code generated by the model that is meant to be executed
|
| 92 |
+
* `codeExecutionResult`: Result of the executable code
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
The naming conventions for these parts vary by programming language.
|
| 97 |
+
|
| 98 |
+
## Use code execution in chat
|
| 99 |
+
|
| 100 |
+
You can also use code execution as part of a chat.
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
from google import genai
|
| 104 |
+
from google.genai import types
|
| 105 |
+
|
| 106 |
+
client = genai.Client()
|
| 107 |
+
|
| 108 |
+
chat = client.chats.create(
|
| 109 |
+
model="gemini-2.5-flash",
|
| 110 |
+
config=types.GenerateContentConfig(
|
| 111 |
+
tools=[types.Tool(code_execution=types.ToolCodeExecution)]
|
| 112 |
+
),
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
response = chat.send_message("I have a math question for you.")
|
| 116 |
+
print(response.text)
|
| 117 |
+
|
| 118 |
+
response = chat.send_message(
|
| 119 |
+
"What is the sum of the first 50 prime numbers? "
|
| 120 |
+
"Generate and run code for the calculation, and make sure you get all 50."
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
for part in response.candidates[0].content.parts:
|
| 124 |
+
if part.text is not None:
|
| 125 |
+
print(part.text)
|
| 126 |
+
if part.executable_code is not None:
|
| 127 |
+
print(part.executable_code.code)
|
| 128 |
+
if part.code_execution_result is not None:
|
| 129 |
+
print(part.code_execution_result.output)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
## Input/output (I/O)
|
| 133 |
+
|
| 134 |
+
Starting with [Gemini 2.0 Flash](/gemini-api/docs/models/gemini#gemini-2.0-flash), code execution supports file input and graph output. Using these input and output capabilities, you can upload CSV and text files, ask questions about the files, and have [Matplotlib](https://matplotlib.org/) graphs generated as part of the response. The output files are returned as inline images in the response.
|
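
A minimal sketch of this I/O flow with the Python SDK; the CSV path and prompt are placeholders, and saving the returned graph assumes it arrives as an `inline_data` image part:

```python
from google import genai
from google.genai import types

client = genai.Client()

# Upload a CSV through the Files API so the code environment can read it.
csv_file = client.files.upload(file="sales.csv")  # placeholder path

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=["Plot monthly revenue from this file as a bar chart.", csv_file],
    config=types.GenerateContentConfig(
        tools=[types.Tool(code_execution=types.ToolCodeExecution)]
    ),
)

# Text, generated code, execution output, and any rendered graph come back as parts.
for part in response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
    if part.executable_code is not None:
        print(part.executable_code.code)
    if part.code_execution_result is not None:
        print(part.code_execution_result.output)
    if part.inline_data is not None:
        with open("chart.png", "wb") as f:
            f.write(part.inline_data.data)
```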
| 135 |
+
|
| 136 |
+
### I/O pricing
|
| 137 |
+
|
| 138 |
+
When using code execution I/O, you're charged for input tokens and output tokens:
|
| 139 |
+
|
| 140 |
+
**Input tokens:**
|
| 141 |
+
|
| 142 |
+
* User prompt
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
**Output tokens:**
|
| 147 |
+
|
| 148 |
+
* Code generated by the model
|
| 149 |
+
* Code execution output in the code environment
|
| 150 |
+
* Thinking tokens
|
| 151 |
+
* Summary generated by the model
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
### I/O details
|
| 156 |
+
|
| 157 |
+
When you're working with code execution I/O, be aware of the following technical details:
|
| 158 |
+
|
| 159 |
+
* The maximum runtime of the code environment is 30 seconds.
|
| 160 |
+
* If the code environment generates an error, the model may decide to regenerate the code output. This can happen up to 5 times.
|
| 161 |
+
* The maximum file input size is limited by the model token window. In AI Studio, using Gemini Flash 2.0, the maximum input file size is 1 million tokens (roughly 2MB for text files of the supported input types). If you upload a file that's too large, AI Studio won't let you send it.
|
| 162 |
+
* Code execution works best with text and CSV files.
|
| 163 |
+
* The input file can be passed in `part.inlineData` or `part.fileData` (uploaded via the [Files API](/gemini-api/docs/files)), and the output file is always returned as `part.inlineData`.
|
| 164 |
+
|
| 165 |
+
| | Single turn | Bidirectional (Multimodal Live API) |
|
| 166 |
+
---|---|---
|
| 167 |
+
Models supported | All Gemini 2.0 and 2.5 models | Only Flash experimental models
|
| 168 |
+
File input types supported | .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts | .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts
|
| 169 |
+
Plotting libraries supported | Matplotlib, seaborn | Matplotlib, seaborn
|
| 170 |
+
[Multi-tool use](/gemini-api/docs/function-calling#multi-tool-use) | Yes (code execution + grounding only) | Yes
|
| 171 |
+
|
| 172 |
+
## Billing
|
| 173 |
+
|
| 174 |
+
There's no additional charge for enabling code execution from the Gemini API. You'll be billed at the current rate of input and output tokens based on the Gemini model you're using.
|
| 175 |
+
|
| 176 |
+
Here are a few other things to know about billing for code execution:
|
| 177 |
+
|
| 178 |
+
* You're only billed once for the input tokens you pass to the model, and you're billed for the final output tokens returned to you by the model.
|
| 179 |
+
* Tokens representing generated code are counted as output tokens. Generated code can include text and multimodal output like images.
|
| 180 |
+
* Code execution results are also counted as output tokens.
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
The billing model is shown in the following diagram:
|
| 185 |
+
|
| 186 |
+

|
| 187 |
+
|
| 188 |
+
* You're billed at the current rate of input and output tokens based on the Gemini model you're using.
|
| 189 |
+
* If Gemini uses code execution when generating your response, the original prompt, the generated code, and the result of the executed code are labeled _intermediate tokens_ and are billed as _input tokens_.
|
| 190 |
+
* Gemini then generates a summary and returns the generated code, the result of the executed code, and the final summary. These are billed as _output tokens_.
|
| 191 |
+
* The Gemini API includes an intermediate token count in the API response, so you know why you're getting additional input tokens beyond your initial prompt.
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
## Limitations
|
| 196 |
+
|
| 197 |
+
* The model can only generate and execute code. It can't return other artifacts like media files.
|
| 198 |
+
* In some cases, enabling code execution can lead to regressions in other areas of model output (for example, writing a story).
|
| 199 |
+
* There is some variation in the ability of the different models to use code execution successfully.
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
## Supported libraries
|
| 204 |
+
|
| 205 |
+
The code execution environment includes the following libraries:
|
| 206 |
+
|
| 207 |
+
* attrs
|
| 208 |
+
* chess
|
| 209 |
+
* contourpy
|
| 210 |
+
* fpdf
|
| 211 |
+
* geopandas
|
| 212 |
+
* imageio
|
| 213 |
+
* jinja2
|
| 214 |
+
* joblib
|
| 215 |
+
* jsonschema
|
| 216 |
+
* jsonschema-specifications
|
| 217 |
+
* lxml
|
| 218 |
+
* matplotlib
|
| 219 |
+
* mpmath
|
| 220 |
+
* numpy
|
| 221 |
+
* opencv-python
|
| 222 |
+
* openpyxl
|
| 223 |
+
* packaging
|
| 224 |
+
* pandas
|
| 225 |
+
* pillow
|
| 226 |
+
* protobuf
|
| 227 |
+
* pylatex
|
| 228 |
+
* pyparsing
|
| 229 |
+
* PyPDF2
|
| 230 |
+
* python-dateutil
|
| 231 |
+
* python-docx
|
| 232 |
+
* python-pptx
|
| 233 |
+
* reportlab
|
| 234 |
+
* scikit-learn
|
| 235 |
+
* scipy
|
| 236 |
+
* seaborn
|
| 237 |
+
* six
|
| 238 |
+
* striprtf
|
| 239 |
+
* sympy
|
| 240 |
+
* tabulate
|
| 241 |
+
* tensorflow
|
| 242 |
+
* toolz
|
| 243 |
+
* xlrd
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
You can't install your own libraries.
|
| 248 |
+
|
| 249 |
+
**Note:** Only `matplotlib` is supported for graph rendering using code execution.
|
| 250 |
+
|
| 251 |
+
## What's next
|
| 252 |
+
|
| 253 |
+
* Try the [code execution Colab](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Code_Execution.ipynb).
|
| 254 |
+
* Learn about other Gemini API tools:
|
| 255 |
+
* [Function calling](/gemini-api/docs/function-calling)
|
| 256 |
+
* [Grounding with Google Search](/gemini-api/docs/grounding)
|
| 257 |
+
|
| 258 |
+
|
documentation_gemini/document_understanding.md
ADDED
|
@@ -0,0 +1,215 @@
|
| 1 |
+
# Document understanding
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/document-processing>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
Gemini models can process documents in PDF format, using native vision to understand entire document contexts. This goes beyond simple text extraction, allowing Gemini to:
|
| 8 |
+
|
| 9 |
+
* Analyze and interpret content, including text, images, diagrams, charts, and tables, even in long documents up to 1000 pages.
|
| 10 |
+
* Extract information into [structured output](/gemini-api/docs/structured-output) formats.
|
| 11 |
+
* Summarize and answer questions based on both the visual and textual elements in a document.
|
| 12 |
+
* Transcribe document content (e.g. to HTML), preserving layouts and formatting, for use in downstream applications.
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
## Passing inline PDF data
|
| 17 |
+
|
| 18 |
+
You can pass inline PDF data in the request to `generateContent`. For PDF payloads under 20MB, you can choose between uploading base64 encoded documents or directly uploading locally stored files.
|
| 19 |
+
|
| 20 |
+
The following example shows you how to fetch a PDF from a URL and convert it to bytes for processing:
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
from google import genai
|
| 24 |
+
from google.genai import types
|
| 25 |
+
import httpx
|
| 26 |
+
|
| 27 |
+
client = genai.Client()
|
| 28 |
+
|
| 29 |
+
doc_url = "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf"
|
| 30 |
+
|
| 31 |
+
# Retrieve and encode the PDF byte
|
| 32 |
+
doc_data = httpx.get(doc_url).content
|
| 33 |
+
|
| 34 |
+
prompt = "Summarize this document"
|
| 35 |
+
response = client.models.generate_content(
|
| 36 |
+
model="gemini-2.5-flash",
|
| 37 |
+
contents=[
|
| 38 |
+
types.Part.from_bytes(
|
| 39 |
+
data=doc_data,
|
| 40 |
+
mime_type='application/pdf',
|
| 41 |
+
),
|
| 42 |
+
prompt])
|
| 43 |
+
print(response.text)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
You can also read a PDF from a local file for processing:
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
from google import genai
|
| 50 |
+
from google.genai import types
|
| 51 |
+
import pathlib
|
| 52 |
+
|
| 53 |
+
client = genai.Client()
|
| 54 |
+
|
| 55 |
+
# Retrieve and encode the PDF byte
|
| 56 |
+
filepath = pathlib.Path('file.pdf')
|
| 57 |
+
|
| 58 |
+
prompt = "Summarize this document"
|
| 59 |
+
response = client.models.generate_content(
|
| 60 |
+
model="gemini-2.5-flash",
|
| 61 |
+
contents=[
|
| 62 |
+
types.Part.from_bytes(
|
| 63 |
+
data=filepath.read_bytes(),
|
| 64 |
+
mime_type='application/pdf',
|
| 65 |
+
),
|
| 66 |
+
prompt])
|
| 67 |
+
print(response.text)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
## Uploading PDFs using the File API
|
| 71 |
+
|
| 72 |
+
You can use the [File API](/gemini-api/docs/files) to upload larger documents. Always use the File API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20MB.
|
| 73 |
+
|
| 74 |
+
**Note:** The [File API](/gemini-api/docs/files) lets you store up to 50MB of PDF files. Files are stored for 48 hours. You can access them in that period with your API key, but you can't download them from the API. The File API is available at no cost in all regions where the Gemini API is available.
|
| 75 |
+
|
| 76 |
+
Call [`media.upload`](/api/rest/v1beta/media/upload) to upload a file using the File API. The following code uploads a document file and then uses the file in a call to [`models.generateContent`](/api/generate-content#method:-models.generatecontent).
|
| 77 |
+
|
| 78 |
+
### Large PDFs from URLs
|
| 79 |
+
|
| 80 |
+
Use the File API to simplify uploading and processing large PDF files from URLs:
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
from google import genai
|
| 84 |
+
from google.genai import types
|
| 85 |
+
import io
|
| 86 |
+
import httpx
|
| 87 |
+
|
| 88 |
+
client = genai.Client()
|
| 89 |
+
|
| 90 |
+
long_context_pdf_path = "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf"
|
| 91 |
+
|
| 92 |
+
# Retrieve and upload the PDF using the File API
|
| 93 |
+
doc_io = io.BytesIO(httpx.get(long_context_pdf_path).content)
|
| 94 |
+
|
| 95 |
+
sample_doc = client.files.upload(
|
| 96 |
+
# You can pass a path or a file-like object here
|
| 97 |
+
file=doc_io,
|
| 98 |
+
config=dict(
|
| 99 |
+
mime_type='application/pdf')
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
prompt = "Summarize this document"
|
| 103 |
+
|
| 104 |
+
response = client.models.generate_content(
|
| 105 |
+
model="gemini-2.5-flash",
|
| 106 |
+
contents=[sample_doc, prompt])
|
| 107 |
+
print(response.text)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
### Large PDFs stored locally
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
from google import genai
|
| 114 |
+
from google.genai import types
|
| 115 |
+
import pathlib
|
| 116 |
+
import httpx
|
| 117 |
+
|
| 118 |
+
client = genai.Client()
|
| 119 |
+
|
| 120 |
+
# Retrieve and encode the PDF byte
|
| 121 |
+
file_path = pathlib.Path('large_file.pdf')
|
| 122 |
+
|
| 123 |
+
# Upload the PDF using the File API
|
| 124 |
+
sample_file = client.files.upload(
|
| 125 |
+
file=file_path,
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
prompt="Summarize this document"
|
| 129 |
+
|
| 130 |
+
response = client.models.generate_content(
|
| 131 |
+
model="gemini-2.5-flash",
|
| 132 |
+
contents=[sample_file, prompt])
|
| 133 |
+
print(response.text)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
You can verify the API successfully stored the uploaded file and get its metadata by calling [`files.get`](/api/rest/v1beta/files/get). Only the `name` (and by extension, the `uri`) are unique.
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
from google import genai
|
| 140 |
+
import pathlib
|
| 141 |
+
|
| 142 |
+
client = genai.Client()
|
| 143 |
+
|
| 144 |
+
fpath = pathlib.Path('example.txt')
|
| 145 |
+
fpath.write_text('hello')
|
| 146 |
+
|
| 147 |
+
file = client.files.upload(file='example.txt')
|
| 148 |
+
|
| 149 |
+
file_info = client.files.get(name=file.name)
|
| 150 |
+
print(file_info.model_dump_json(indent=4))
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
## Passing multiple PDFs
|
| 154 |
+
|
| 155 |
+
The Gemini API is capable of processing multiple PDF documents (up to 1000 pages) in a single request, as long as the combined size of the documents and the text prompt stays within the model's context window.
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
from google import genai
|
| 159 |
+
import io
|
| 160 |
+
import httpx
|
| 161 |
+
|
| 162 |
+
client = genai.Client()
|
| 163 |
+
|
| 164 |
+
doc_url_1 = "https://arxiv.org/pdf/2312.11805"
|
| 165 |
+
doc_url_2 = "https://arxiv.org/pdf/2403.05530"
|
| 166 |
+
|
| 167 |
+
# Retrieve and upload both PDFs using the File API
|
| 168 |
+
doc_data_1 = io.BytesIO(httpx.get(doc_url_1).content)
|
| 169 |
+
doc_data_2 = io.BytesIO(httpx.get(doc_url_2).content)
|
| 170 |
+
|
| 171 |
+
sample_pdf_1 = client.files.upload(
|
| 172 |
+
file=doc_data_1,
|
| 173 |
+
config=dict(mime_type='application/pdf')
|
| 174 |
+
)
|
| 175 |
+
sample_pdf_2 = client.files.upload(
|
| 176 |
+
file=doc_data_2,
|
| 177 |
+
config=dict(mime_type='application/pdf')
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
prompt = "What is the difference between each of the main benchmarks between these two papers? Output these in a table."
|
| 181 |
+
|
| 182 |
+
response = client.models.generate_content(
|
| 183 |
+
model="gemini-2.5-flash",
|
| 184 |
+
contents=[sample_pdf_1, sample_pdf_2, prompt])
|
| 185 |
+
print(response.text)
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
## Technical details
|
| 189 |
+
|
| 190 |
+
Gemini supports a maximum of 1,000 document pages. Each document page is equivalent to 258 tokens.
|
| 191 |
+
|
| 192 |
+
There are no specific limits on the number of pixels in a document besides the model's [context window](/gemini-api/docs/long-context). Larger pages are scaled down to a maximum resolution of 3072x3072 while preserving their original aspect ratio, and smaller pages are scaled up to 768x768 pixels. Aside from bandwidth, there is no cost reduction for smaller pages and no performance improvement for pages at higher resolution.
|
| 193 |
+
|
| 194 |
+
### Document types
|
| 195 |
+
|
| 196 |
+
Technically, you can pass other MIME types for document understanding, like TXT, Markdown, HTML, XML, etc. However, document vision **_only meaningfully understands PDFs_**. Other types will be extracted as pure text, and the model won't be able to interpret what we see in the rendering of those files. Any file-type specifics like charts, diagrams, HTML tags, Markdown formatting, etc., will be lost.
|
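
For example, a plain-text or Markdown file can still be sent as inline data, but only its raw text is used (a minimal sketch reusing the `client` from the examples above; the file path is a placeholder):

```python
from google.genai import types
import pathlib

text_bytes = pathlib.Path("notes.txt").read_bytes()  # placeholder path

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=text_bytes, mime_type="text/plain"),
        "Summarize this file",
    ],
)
print(response.text)
```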
| 197 |
+
|
| 198 |
+
### Best practices
|
| 199 |
+
|
| 200 |
+
For best results:
|
| 201 |
+
|
| 202 |
+
* Rotate pages to the correct orientation before uploading.
|
| 203 |
+
* Avoid blurry pages.
|
| 204 |
+
* If using a single page, place the text prompt after the page.
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
## What's next
|
| 209 |
+
|
| 210 |
+
To learn more, see the following resources:
|
| 211 |
+
|
| 212 |
+
* [File prompting strategies](/gemini-api/docs/files#prompt-guide): The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting.
|
| 213 |
+
* [System instructions](/gemini-api/docs/text-generation#system-instructions): System instructions let you steer the behavior of the model based on your specific needs and use cases.
|
| 214 |
+
|
| 215 |
+
|
documentation_gemini/function_calling_with_the_gemini_api.md
ADDED
|
@@ -0,0 +1,674 @@
|
| 1 |
+
# Function calling with the Gemini API
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/function-calling>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases:
|
| 8 |
+
|
| 9 |
+
* **Augment Knowledge:** Access information from external sources like databases, APIs, and knowledge bases.
|
| 10 |
+
* **Extend Capabilities:** Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts.
|
| 11 |
+
* **Take Actions:** Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
## How function calling works
|
| 18 |
+
|
| 19 |
+

|
| 21 |
+
|
| 22 |
+
Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process:
|
| 23 |
+
|
| 24 |
+
1. **Define Function Declaration:** Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model.
|
| 25 |
+
2. **Call LLM with function declarations:** Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object.
|
| 26 |
+
3. **Execute Function Code (Your Responsibility):** The model _does not_ execute the function itself. It's your application's responsibility to process the response and check for a function call:
|
| 27 |
+
* **Yes**: Extract the name and args of the function and execute the corresponding function in your application.
|
| 28 |
+
* **No:** The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome).
|
| 29 |
+
4. **Create User friendly response:** If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call.
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ([parallel function calling](/gemini-api/docs/function-calling#parallel_function_calling)) and in sequence ([compositional function calling](/gemini-api/docs/function-calling#compositional_function_calling)).
|
| 34 |
+
|
| 35 |
+
### Step 1: Define a function declaration
|
| 36 |
+
|
| 37 |
+
Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs.
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Define a function that the model can call to control smart lights
|
| 41 |
+
set_light_values_declaration = {
|
| 42 |
+
"name": "set_light_values",
|
| 43 |
+
"description": "Sets the brightness and color temperature of a light.",
|
| 44 |
+
"parameters": {
|
| 45 |
+
"type": "object",
|
| 46 |
+
"properties": {
|
| 47 |
+
"brightness": {
|
| 48 |
+
"type": "integer",
|
| 49 |
+
"description": "Light level from 0 to 100. Zero is off and 100 is full brightness",
|
| 50 |
+
},
|
| 51 |
+
"color_temp": {
|
| 52 |
+
"type": "string",
|
| 53 |
+
"enum": ["daylight", "cool", "warm"],
|
| 54 |
+
"description": "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.",
|
| 55 |
+
},
|
| 56 |
+
},
|
| 57 |
+
"required": ["brightness", "color_temp"],
|
| 58 |
+
},
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
# This is the actual function that would be called based on the model's suggestion
|
| 62 |
+
def set_light_values(brightness: int, color_temp: str) -> dict[str, int | str]:
|
| 63 |
+
"""Set the brightness and color temperature of a room light. (mock API).
|
| 64 |
+
|
| 65 |
+
Args:
|
| 66 |
+
brightness: Light level from 0 to 100. Zero is off and 100 is full brightness
|
| 67 |
+
color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.
|
| 68 |
+
|
| 69 |
+
Returns:
|
| 70 |
+
A dictionary containing the set brightness and color temperature.
|
| 71 |
+
"""
|
| 72 |
+
return {"brightness": brightness, "colorTemperature": color_temp}
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
### Step 2: Call the model with function declarations
|
| 76 |
+
|
| 77 |
+
Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion.
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
from google import genai
from google.genai import types
|
| 81 |
+
|
| 82 |
+
# Configure the client and tools
|
| 83 |
+
client = genai.Client()
|
| 84 |
+
tools = types.Tool(function_declarations=[set_light_values_declaration])
|
| 85 |
+
config = types.GenerateContentConfig(tools=[tools])
|
| 86 |
+
|
| 87 |
+
# Define user prompt
|
| 88 |
+
contents = [
|
| 89 |
+
types.Content(
|
| 90 |
+
role="user", parts=[types.Part(text="Turn the lights down to a romantic level")]
|
| 91 |
+
)
|
| 92 |
+
]
|
| 93 |
+
|
| 94 |
+
# Send request with function declarations
|
| 95 |
+
response = client.models.generate_content(
|
| 96 |
+
model="gemini-2.5-flash",
|
| 97 |
+
contents=contents,
|
| 98 |
+
config=config,
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
print(response.candidates[0].content.parts[0].function_call)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
The model then returns a `functionCall` object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question.
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
id=None args={'color_temp': 'warm', 'brightness': 25} name='set_light_values'
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
### Step 3: Execute set_light_values function code
|
| 111 |
+
|
| 112 |
+
Extract the function call details from the model's response, parse the arguments, and execute the `set_light_values` function.
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Extract the tool call details; the call may not be in the first part.
|
| 116 |
+
tool_call = response.candidates[0].content.parts[0].function_call
|
| 117 |
+
|
| 118 |
+
if tool_call.name == "set_light_values":
|
| 119 |
+
result = set_light_values(**tool_call.args)
|
| 120 |
+
print(f"Function execution result: {result}")
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
### Step 4: Create user friendly response with function result and call the model again
|
| 124 |
+
|
| 125 |
+
Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user.
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# Create a function response part
|
| 129 |
+
function_response_part = types.Part.from_function_response(
|
| 130 |
+
name=tool_call.name,
|
| 131 |
+
response={"result": result},
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
# Append function call and result of the function execution to contents
|
| 135 |
+
contents.append(response.candidates[0].content) # Append the content from the model's response.
|
| 136 |
+
contents.append(types.Content(role="user", parts=[function_response_part])) # Append the function response
|
| 137 |
+
|
| 138 |
+
final_response = client.models.generate_content(
|
| 139 |
+
model="gemini-2.5-flash",
|
| 140 |
+
config=config,
|
| 141 |
+
contents=contents,
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
print(final_response.text)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
This completes the function calling flow. The model successfully used the `set_light_values` function to perform the action requested by the user.
|
| 148 |
+
|
| 149 |
+
## Function declarations
|
| 150 |
+
|
| 151 |
+
When you implement function calling in a prompt, you create a `tools` object, which contains one or more `function declarations`. You define functions using JSON, specifically with a [select subset](https://ai.google.dev/api/caching#Schema) of the [OpenAPI schema](https://spec.openapis.org/oas/v3.0.3#schemaw) format. A single function declaration can include the following parameters:
|
| 152 |
+
|
| 153 |
+
* `name` (string): A unique name for the function (`get_weather_forecast`, `send_email`). Use descriptive names without spaces or special characters (use underscores or camelCase).
|
| 154 |
+
* `description` (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters.").
|
| 155 |
+
* `parameters` (object): Defines the input parameters the function expects.
|
| 156 |
+
* `type` (string): Specifies the overall data type, such as `object`.
|
| 157 |
+
* `properties` (object): Lists individual parameters, each with:
|
| 158 |
+
* `type` (string): The data type of the parameter, such as `string`, `integer`, `boolean, array`.
|
| 159 |
+
* `description` (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'.").
|
| 160 |
+
* `enum` (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]).
|
| 161 |
+
* `required` (array): An array of strings listing the parameter names that are mandatory for the function to operate.
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
You can also construct FunctionDeclarations from Python functions directly using `types.FunctionDeclaration.from_callable(client=client, callable=your_function)`.
|
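
A short sketch of that approach, assuming `client` is already configured and `set_light_values` is the Python function defined earlier:

```python
from google.genai import types

# Build the declaration from the function's signature and docstring.
declaration = types.FunctionDeclaration.from_callable(
    client=client, callable=set_light_values
)
print(declaration.name)         # set_light_values
print(declaration.description)  # taken from the docstring

tools = types.Tool(function_declarations=[declaration])
config = types.GenerateContentConfig(tools=[tools])
```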
| 166 |
+
|
| 167 |
+
## Function calling with thinking
|
| 168 |
+
|
| 169 |
+
Enabling "[thinking](/gemini-api/docs/thinking)" can improve function call performance by allowing the model to reason through a request before suggesting function calls. Because the Gemini API is stateless, the model's reasoning context is lost between turns in a multi-turn conversation. To preserve this context, you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns.
|
| 170 |
+
|
| 171 |
+
The [standard pattern for multi-turn tool use](/gemini-api/docs/function-calling?example=weather#step-4) is to append the model's complete previous response to the conversation history. The `content` object includes the `thought_signatures` automatically. If you follow this pattern, **no code changes are required**.
|
| 172 |
+
|
| 173 |
+
### Manually managing thought signatures
|
| 174 |
+
|
| 175 |
+
If you modify the conversation history manually instead of sending the complete previous response, and you want to benefit from thinking, you must correctly handle the `thought_signature` included in the model's turn.
|
| 176 |
+
|
| 177 |
+
Follow these rules to ensure the model's context is preserved:
|
| 178 |
+
|
| 179 |
+
* Always send the `thought_signature` back to the model inside its original `Part`.
|
| 180 |
+
* Don't merge a `Part` containing a signature with one that does not. This breaks the positional context of the thought.
|
| 181 |
+
* Don't combine two `Parts` that both contain signatures, as the signature strings cannot be merged.
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
### Inspecting thought signatures
|
| 186 |
+
|
| 187 |
+
While not necessary for implementation, you can inspect the response to see the `thought_signature` for debugging or educational purposes.
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
import base64
|
| 191 |
+
# After receiving a response from a model with thinking enabled
|
| 192 |
+
# response = client.models.generate_content(...)
|
| 193 |
+
|
| 194 |
+
# The signature is attached to the response part containing the function call
|
| 195 |
+
part = response.candidates[0].content.parts[0]
|
| 196 |
+
if part.thought_signature:
|
| 197 |
+
print(base64.b64encode(part.thought_signature).decode("utf-8"))
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
Learn more about limitations and usage of thought signatures, and about thinking models in general, on the [Thinking](/gemini-api/docs/thinking#signatures) page.
|
| 201 |
+
|
| 202 |
+
## Parallel function calling
|
| 203 |
+
|
| 204 |
+
In addition to single-turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions simultaneously and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases, checking inventory levels across various warehouses, or performing multiple actions at once, such as converting your apartment into a disco.
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
power_disco_ball = {
|
| 208 |
+
"name": "power_disco_ball",
|
| 209 |
+
"description": "Powers the spinning disco ball.",
|
| 210 |
+
"parameters": {
|
| 211 |
+
"type": "object",
|
| 212 |
+
"properties": {
|
| 213 |
+
"power": {
|
| 214 |
+
"type": "boolean",
|
| 215 |
+
"description": "Whether to turn the disco ball on or off.",
|
| 216 |
+
}
|
| 217 |
+
},
|
| 218 |
+
"required": ["power"],
|
| 219 |
+
},
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
start_music = {
|
| 223 |
+
"name": "start_music",
|
| 224 |
+
"description": "Play some music matching the specified parameters.",
|
| 225 |
+
"parameters": {
|
| 226 |
+
"type": "object",
|
| 227 |
+
"properties": {
|
| 228 |
+
"energetic": {
|
| 229 |
+
"type": "boolean",
|
| 230 |
+
"description": "Whether the music is energetic or not.",
|
| 231 |
+
},
|
| 232 |
+
"loud": {
|
| 233 |
+
"type": "boolean",
|
| 234 |
+
"description": "Whether the music is loud or not.",
|
| 235 |
+
},
|
| 236 |
+
},
|
| 237 |
+
"required": ["energetic", "loud"],
|
| 238 |
+
},
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
dim_lights = {
|
| 242 |
+
"name": "dim_lights",
|
| 243 |
+
"description": "Dim the lights.",
|
| 244 |
+
"parameters": {
|
| 245 |
+
"type": "object",
|
| 246 |
+
"properties": {
|
| 247 |
+
"brightness": {
|
| 248 |
+
"type": "number",
|
| 249 |
+
"description": "The brightness of the lights, 0.0 is off, 1.0 is full.",
|
| 250 |
+
}
|
| 251 |
+
},
|
| 252 |
+
"required": ["brightness"],
|
| 253 |
+
},
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
Configure the function calling mode to allow using all of the specified tools. To learn more, you can read about [configuring function calling](/gemini-api/docs/function-calling#function_calling_modes).
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
from google import genai
|
| 261 |
+
from google.genai import types
|
| 262 |
+
|
| 263 |
+
# Configure the client and tools
|
| 264 |
+
client = genai.Client()
|
| 265 |
+
house_tools = [
|
| 266 |
+
types.Tool(function_declarations=[power_disco_ball, start_music, dim_lights])
|
| 267 |
+
]
|
| 268 |
+
config = types.GenerateContentConfig(
|
| 269 |
+
tools=house_tools,
|
| 270 |
+
automatic_function_calling=types.AutomaticFunctionCallingConfig(
|
| 271 |
+
disable=True
|
| 272 |
+
),
|
| 273 |
+
# Force the model to call 'any' function, instead of chatting.
|
| 274 |
+
tool_config=types.ToolConfig(
|
| 275 |
+
function_calling_config=types.FunctionCallingConfig(mode='ANY')
|
| 276 |
+
),
|
| 277 |
+
)
|
| 278 |
+
|
| 279 |
+
chat = client.chats.create(model="gemini-2.5-flash", config=config)
|
| 280 |
+
response = chat.send_message("Turn this place into a party!")
|
| 281 |
+
|
| 282 |
+
# Print out each of the function calls requested from this single call
|
| 283 |
+
print("Example 1: Forced function calling")
|
| 284 |
+
for fn in response.function_calls:
|
| 285 |
+
args = ", ".join(f"{key}={val}" for key, val in fn.args.items())
|
| 286 |
+
print(f"{fn.name}({args})")
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
Each of the printed results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested.
|
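
A sketch of that return step for the disco example, assuming each requested function was executed locally and `chat` is the session created above (the result dicts are placeholders; note that with `mode='ANY'` the model is forced to keep calling functions, so switch the tool config back before expecting a text reply):

```python
from google.genai import types

# Placeholder results, keyed by function name.
results = {
    "power_disco_ball": {"status": "on"},
    "start_music": {"music_type": "energetic", "volume": "loud"},
    "dim_lights": {"brightness": 0.5},
}

# Build one function response per call, preserving the order they were requested in.
response_parts = [
    types.Part.from_function_response(
        name=fn.name, response={"result": results[fn.name]}
    )
    for fn in response.function_calls
]

# Send all tool results back in a single message.
final = chat.send_message(response_parts)
print(final.text)
```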
| 290 |
+
|
| 291 |
+
The Python SDK supports [automatic function calling](/gemini-api/docs/function-calling#automatic_function_calling_python_only), which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case.
|
| 292 |
+
|
| 293 |
+
**Note:** Automatic Function Calling is a Python SDK only feature at the moment.
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
from google import genai
|
| 297 |
+
from google.genai import types
|
| 298 |
+
|
| 299 |
+
# Actual function implementations
|
| 300 |
+
def power_disco_ball_impl(power: bool) -> dict:
|
| 301 |
+
"""Powers the spinning disco ball.
|
| 302 |
+
|
| 303 |
+
Args:
|
| 304 |
+
power: Whether to turn the disco ball on or off.
|
| 305 |
+
|
| 306 |
+
Returns:
|
| 307 |
+
A status dictionary indicating the current state.
|
| 308 |
+
"""
|
| 309 |
+
return {"status": f"Disco ball powered {'on' if power else 'off'}"}
|
| 310 |
+
|
| 311 |
+
def start_music_impl(energetic: bool, loud: bool) -> dict:
|
| 312 |
+
"""Play some music matching the specified parameters.
|
| 313 |
+
|
| 314 |
+
Args:
|
| 315 |
+
energetic: Whether the music is energetic or not.
|
| 316 |
+
loud: Whether the music is loud or not.
|
| 317 |
+
|
| 318 |
+
Returns:
|
| 319 |
+
A dictionary containing the music settings.
|
| 320 |
+
"""
|
| 321 |
+
music_type = "energetic" if energetic else "chill"
|
| 322 |
+
volume = "loud" if loud else "quiet"
|
| 323 |
+
return {"music_type": music_type, "volume": volume}
|
| 324 |
+
|
| 325 |
+
def dim_lights_impl(brightness: float) -> dict:
|
| 326 |
+
"""Dim the lights.
|
| 327 |
+
|
| 328 |
+
Args:
|
| 329 |
+
brightness: The brightness of the lights, 0.0 is off, 1.0 is full.
|
| 330 |
+
|
| 331 |
+
Returns:
|
| 332 |
+
A dictionary containing the new brightness setting.
|
| 333 |
+
"""
|
| 334 |
+
return {"brightness": brightness}
|
| 335 |
+
|
| 336 |
+
# Configure the client
|
| 337 |
+
client = genai.Client()
|
| 338 |
+
config = types.GenerateContentConfig(
|
| 339 |
+
tools=[power_disco_ball_impl, start_music_impl, dim_lights_impl]
|
| 340 |
+
)
|
| 341 |
+
|
| 342 |
+
# Make the request
|
| 343 |
+
response = client.models.generate_content(
|
| 344 |
+
model="gemini-2.5-flash",
|
| 345 |
+
contents="Do everything you need to this place into party!",
|
| 346 |
+
config=config,
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
print("\nExample 2: Automatic function calling")
|
| 350 |
+
print(response.text)
|
| 351 |
+
# I've turned on the disco ball, started playing loud and energetic music, and dimmed the lights to 50% brightness. Let's get this party started!
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
## Compositional function calling
|
| 355 |
+
|
| 356 |
+
Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a `get_current_location()` function followed by a `get_weather()` function that takes the location as a parameter.
|
| 357 |
+
|
| 358 |
+
The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling.
|
| 359 |
+
|
| 360 |
+
This example uses the automatic function calling feature of the `google-genai` Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task.
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
import os
|
| 364 |
+
from google import genai
|
| 365 |
+
from google.genai import types
|
| 366 |
+
|
| 367 |
+
# Example Functions
|
| 368 |
+
def get_weather_forecast(location: str) -> dict:
|
| 369 |
+
"""Gets the current weather temperature for a given location."""
|
| 370 |
+
print(f"Tool Call: get_weather_forecast(location={location})")
|
| 371 |
+
# TODO: Make API call
|
| 372 |
+
print("Tool Response: {'temperature': 25, 'unit': 'celsius'}")
|
| 373 |
+
return {"temperature": 25, "unit": "celsius"} # Dummy response
|
| 374 |
+
|
| 375 |
+
def set_thermostat_temperature(temperature: int) -> dict:
|
| 376 |
+
"""Sets the thermostat to a desired temperature."""
|
| 377 |
+
print(f"Tool Call: set_thermostat_temperature(temperature={temperature})")
|
| 378 |
+
# TODO: Interact with a thermostat API
|
| 379 |
+
print("Tool Response: {'status': 'success'}")
|
| 380 |
+
return {"status": "success"}
|
| 381 |
+
|
| 382 |
+
# Configure the client and model
|
| 383 |
+
client = genai.Client()
|
| 384 |
+
config = types.GenerateContentConfig(
|
| 385 |
+
tools=[get_weather_forecast, set_thermostat_temperature]
|
| 386 |
+
)
|
| 387 |
+
|
| 388 |
+
# Make the request
|
| 389 |
+
response = client.models.generate_content(
|
| 390 |
+
model="gemini-2.5-flash",
|
| 391 |
+
contents="If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C.",
|
| 392 |
+
config=config,
|
| 393 |
+
)
|
| 394 |
+
|
| 395 |
+
# Print the final, user-facing response
|
| 396 |
+
print(response.text)
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
**Expected Output**
|
| 400 |
+
|
| 401 |
+
When you run the code, you will see the SDK orchestrating the function calls. The model first calls `get_weather_forecast`, receives the temperature, and then calls `set_thermostat_temperature` with the correct value based on the logic in the prompt.
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
Tool Call: get_weather_forecast(location=London)
|
| 405 |
+
Tool Response: {'temperature': 25, 'unit': 'celsius'}
|
| 406 |
+
Tool Call: set_thermostat_temperature(temperature=20)
|
| 407 |
+
Tool Response: {'status': 'success'}
|
| 408 |
+
OK. I've set the thermostat to 20°C.
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
Compositional function calling is a native [Live API](https://ai.google.dev/gemini-api/docs/live) feature. This means Live API can handle the function calling similar to the Python SDK.
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
# Light control schemas
|
| 415 |
+
turn_on_the_lights_schema = {'name': 'turn_on_the_lights'}
|
| 416 |
+
turn_off_the_lights_schema = {'name': 'turn_off_the_lights'}
|
| 417 |
+
|
| 418 |
+
prompt = """
|
| 419 |
+
Hey, can you write and run some Python code to turn on the lights, wait 10s and then turn off the lights?
|
| 420 |
+
"""
|
| 421 |
+
|
| 422 |
+
tools = [
|
| 423 |
+
{'code_execution': {}},
|
| 424 |
+
{'function_declarations': [turn_on_the_lights_schema, turn_off_the_lights_schema]}
|
| 425 |
+
]
|
| 426 |
+
|
| 427 |
+
await run(prompt, tools=tools, modality="AUDIO")
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
## Function calling modes
|
| 431 |
+
|
| 432 |
+
The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the `function_calling_config`.
|
| 433 |
+
|
| 434 |
+
* `AUTO (Default)`: The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios.
|
| 435 |
+
* `ANY`: The model is constrained to always predict a function call and guarantees function schema adherence. If `allowed_function_names` is not specified, the model can choose from any of the provided function declarations. If `allowed_function_names` is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable).
|
| 436 |
+
* `NONE`: The model is _prohibited_ from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions.
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
from google.genai import types
|
| 443 |
+
|
| 444 |
+
# Configure function calling mode
|
| 445 |
+
tool_config = types.ToolConfig(
|
| 446 |
+
function_calling_config=types.FunctionCallingConfig(
|
| 447 |
+
mode="ANY", allowed_function_names=["get_current_temperature"]
|
| 448 |
+
)
|
| 449 |
+
)
|
| 450 |
+
|
| 451 |
+
# Create the generation config
|
| 452 |
+
config = types.GenerateContentConfig(
|
| 453 |
+
tools=[tools], # not defined here.
|
| 454 |
+
tool_config=tool_config,
|
| 455 |
+
)
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
## Automatic function calling (Python only)
|
| 459 |
+
|
| 460 |
+
When using the Python SDK, you can provide Python functions directly as tools. The SDK converts these functions into declarations, manages the function call execution, and handles the response cycle for you. Define your function with type hints and a docstring. For optimal results, it is recommended to use [Google-style docstrings.](https://google.github.io/styleguide/pyguide.html#383-functions-and-methods) The SDK will then automatically:
|
| 461 |
+
|
| 462 |
+
1. Detect function call responses from the model.
|
| 463 |
+
2. Call the corresponding Python function in your code.
|
| 464 |
+
3. Send the function's response back to the model.
|
| 465 |
+
4. Return the model's final text response.
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
The SDK currently does not parse argument descriptions into the property description slots of the generated function declaration. Instead, it sends the entire docstring as the top-level function description.
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
from google import genai
|
| 473 |
+
from google.genai import types
|
| 474 |
+
|
| 475 |
+
# Define the function with type hints and docstring
|
| 476 |
+
def get_current_temperature(location: str) -> dict:
|
| 477 |
+
"""Gets the current temperature for a given location.
|
| 478 |
+
|
| 479 |
+
Args:
|
| 480 |
+
location: The city and state, e.g. San Francisco, CA
|
| 481 |
+
|
| 482 |
+
Returns:
|
| 483 |
+
A dictionary containing the temperature and unit.
|
| 484 |
+
"""
|
| 485 |
+
# ... (implementation) ...
|
| 486 |
+
return {"temperature": 25, "unit": "Celsius"}
|
| 487 |
+
|
| 488 |
+
# Configure the client
|
| 489 |
+
client = genai.Client()
|
| 490 |
+
config = types.GenerateContentConfig(
|
| 491 |
+
tools=[get_current_temperature]
|
| 492 |
+
) # Pass the function itself
|
| 493 |
+
|
| 494 |
+
# Make the request
|
| 495 |
+
response = client.models.generate_content(
|
| 496 |
+
model="gemini-2.5-flash",
|
| 497 |
+
contents="What's the temperature in Boston?",
|
| 498 |
+
config=config,
|
| 499 |
+
)
|
| 500 |
+
|
| 501 |
+
print(response.text) # The SDK handles the function call and returns the final text
|
| 502 |
+
|
| 503 |
+
|
| 504 |
+
You can disable automatic function calling with:
|
| 505 |
+
|
| 506 |
+
|
| 507 |
+
config = types.GenerateContentConfig(
|
| 508 |
+
tools=[get_current_temperature],
|
| 509 |
+
automatic_function_calling=types.AutomaticFunctionCallingConfig(disable=True)
|
| 510 |
+
)
|
| 511 |
+
|
| 512 |
+
|
| 513 |
+
### Automatic function schema declaration
|
| 514 |
+
|
| 515 |
+
The API is able to describe any of the following types. `Pydantic` types are allowed, as long as the fields defined on them are also composed of allowed types. Dict types (like `dict[str, int]`) are not well supported here, so don't use them.
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
AllowedType = (
|
| 519 |
+
int | float | bool | str | list['AllowedType'] | pydantic.BaseModel)
|
| 520 |
+
|
| 521 |
+
|
| 522 |
+
To see what the inferred schema looks like, you can convert it using [`from_callable`](https://googleapis.github.io/python-genai/genai.html#genai.types.FunctionDeclaration.from_callable):
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
def multiply(a: float, b: float):
|
| 526 |
+
"""Returns a * b."""
|
| 527 |
+
return a * b
|
| 528 |
+
|
| 529 |
+
fn_decl = types.FunctionDeclaration.from_callable(callable=multiply, client=client)
|
| 530 |
+
|
| 531 |
+
# to_json_dict() provides a clean JSON representation.
|
| 532 |
+
print(fn_decl.to_json_dict())
|
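As a hedged illustration of the Pydantic support mentioned above, a hypothetical `Movie` model can appear as a parameter type and the SDK will infer a nested object schema for it:

import pydantic

class Movie(pydantic.BaseModel):
    title: str
    year: int

def recommend_similar(movie: Movie, count: int) -> list[str]:
    """Recommends up to `count` movies similar to the given movie."""
    return []  # placeholder implementation

# Inspect the inferred declaration, including the nested Movie object schema.
fn_decl = types.FunctionDeclaration.from_callable(callable=recommend_similar, client=client)
print(fn_decl.to_json_dict())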
| 533 |
+
|
| 534 |
+
|
| 535 |
+
## Multi-tool use: Combine native tools with function calling
|
| 536 |
+
|
| 537 |
+
You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, [Grounding with Google Search](/gemini-api/docs/grounding) and [code execution](/gemini-api/docs/code-execution), in a request using the [Live API](/gemini-api/docs/live).
|
| 538 |
+
|
| 539 |
+
**Note:** Multi-tool use is a [Live API](https://ai.google.dev/gemini-api/docs/live)-only feature at the moment. The `run()` function declaration, which handles the asynchronous websocket setup, is omitted for brevity.
|
| 540 |
+
|
| 541 |
+
|
| 542 |
+
# Multiple tasks example - combining lights, code execution, and search
|
| 543 |
+
prompt = """
|
| 544 |
+
Hey, I need you to do three things for me.
|
| 545 |
+
|
| 546 |
+
1. Turn on the lights.
|
| 547 |
+
2. Then compute the largest prime palindrome under 100000.
|
| 548 |
+
3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024.
|
| 549 |
+
|
| 550 |
+
Thanks!
|
| 551 |
+
"""
|
| 552 |
+
|
| 553 |
+
tools = [
|
| 554 |
+
{'google_search': {}},
|
| 555 |
+
{'code_execution': {}},
|
| 556 |
+
{'function_declarations': [turn_on_the_lights_schema, turn_off_the_lights_schema]} # not defined here.
|
| 557 |
+
]
|
| 558 |
+
|
| 559 |
+
# Execute the prompt with specified tools in audio modality
|
| 560 |
+
await run(prompt, tools=tools, modality="AUDIO")
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
Python developers can try this out in the [Live API Tool Use notebook](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Get_started_LiveAPI_tools.ipynb).
|
| 564 |
+
|
| 565 |
+
## Model context protocol (MCP)
|
| 566 |
+
|
| 567 |
+
[Model Context Protocol (MCP)](https://modelcontextprotocol.io/introduction) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts.
|
| 568 |
+
|
| 569 |
+
The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering [automatic tool calling](/gemini-api/docs/function-calling#automatic_function_calling_python_only) for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model.
|
| 570 |
+
|
| 571 |
+
Here, you can find an example of how to use a local MCP server with Gemini and `mcp` SDK.
|
| 572 |
+
|
| 573 |
+
Make sure the latest version of the [`mcp` SDK](https://modelcontextprotocol.io/introduction) is installed on your platform of choice.
|
| 574 |
+
|
| 575 |
+
|
| 576 |
+
pip install mcp
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
**Note:** Python supports automatic tool calling by passing the `ClientSession` into the `tools` parameter. If you want to disable it, set `automatic_function_calling` with `disable=True`.
|
| 580 |
+
|
| 581 |
+
|
| 582 |
+
import os
|
| 583 |
+
import asyncio
|
| 584 |
+
from datetime import datetime
|
| 585 |
+
from mcp import ClientSession, StdioServerParameters
|
| 586 |
+
from mcp.client.stdio import stdio_client
|
| 587 |
+
from google import genai
|
| 588 |
+
|
| 589 |
+
client = genai.Client()
|
| 590 |
+
|
| 591 |
+
# Create server parameters for stdio connection
|
| 592 |
+
server_params = StdioServerParameters(
|
| 593 |
+
command="npx", # Executable
|
| 594 |
+
args=["-y", "@philschmid/weather-mcp"], # MCP Server
|
| 595 |
+
env=None, # Optional environment variables
|
| 596 |
+
)
|
| 597 |
+
|
| 598 |
+
async def run():
|
| 599 |
+
async with stdio_client(server_params) as (read, write):
|
| 600 |
+
async with ClientSession(read, write) as session:
|
| 601 |
+
# Prompt to get the weather for the current day in London.
|
| 602 |
+
prompt = f"What is the weather in London in {datetime.now().strftime('%Y-%m-%d')}?"
|
| 603 |
+
|
| 604 |
+
# Initialize the connection between client and server
|
| 605 |
+
await session.initialize()
|
| 606 |
+
|
| 607 |
+
# Send request to the model with MCP function declarations
|
| 608 |
+
response = await client.aio.models.generate_content(
|
| 609 |
+
model="gemini-2.5-flash",
|
| 610 |
+
contents=prompt,
|
| 611 |
+
config=genai.types.GenerateContentConfig(
|
| 612 |
+
temperature=0,
|
| 613 |
+
tools=[session], # uses the session, will automatically call the tool
|
| 614 |
+
# Uncomment if you **don't** want the SDK to automatically call the tool
|
| 615 |
+
# automatic_function_calling=genai.types.AutomaticFunctionCallingConfig(
|
| 616 |
+
# disable=True
|
| 617 |
+
# ),
|
| 618 |
+
),
|
| 619 |
+
)
|
| 620 |
+
print(response.text)
|
| 621 |
+
|
| 622 |
+
# Start the asyncio event loop and run the main function
|
| 623 |
+
asyncio.run(run())
|
| 624 |
+
|
| 625 |
+
|
| 626 |
+
### Limitations with built-in MCP support
|
| 627 |
+
|
| 628 |
+
Built-in MCP support is an [experimental](/gemini-api/docs/models#preview) feature in our SDKs and has the following limitations:
|
| 629 |
+
|
| 630 |
+
* Only tools are supported, not resources or prompts
|
| 631 |
+
* It is available for the Python and JavaScript/TypeScript SDK.
|
| 632 |
+
* Breaking changes might occur in future releases.
|
| 633 |
+
|
| 634 |
+
|
| 635 |
+
|
| 636 |
+
Manual integration of MCP servers is always an option if these limitations affect what you're building.
|
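For reference, a manual loop might look roughly like the following sketch: list the server's tools, expose them as plain function declarations, and dispatch any requested calls yourself. The schema mapping is an assumption here; MCP tool schemas sometimes include fields you may need to strip before the Gemini API accepts them.

# Inside the ClientSession block from the earlier example, instead of passing `session`:
mcp_tools = await session.list_tools()
declarations = [
    {
        "name": tool.name,
        "description": tool.description,
        "parameters": tool.inputSchema,  # JSON schema reported by the MCP server
    }
    for tool in mcp_tools.tools
]

response = await client.aio.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    config=genai.types.GenerateContentConfig(
        tools=[genai.types.Tool(function_declarations=declarations)],
    ),
)

# Execute any requested calls against the MCP server yourself, then return the
# results to the model as function response parts, as shown earlier in this guide.
for fn in response.function_calls or []:
    result = await session.call_tool(fn.name, arguments=dict(fn.args))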
| 637 |
+
|
| 638 |
+
## Supported models
|
| 639 |
+
|
| 640 |
+
This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the [model overview](https://ai.google.dev/gemini-api/docs/models) page.
|
| 641 |
+
|
| 642 |
+
Model | Function Calling | Parallel Function Calling | Compositional Function Calling
|
| 643 |
+
---|---|---|---
|
| 644 |
+
Gemini 2.5 Pro | ✔️ | ✔️ | ✔️
|
| 645 |
+
Gemini 2.5 Flash | ✔️ | ✔️ | ✔️
|
| 646 |
+
Gemini 2.5 Flash-Lite | ✔️ | ✔️ | ✔️
|
| 647 |
+
Gemini 2.0 Flash | ✔️ | ✔️ | ✔️
|
| 648 |
+
Gemini 2.0 Flash-Lite | X | X | X
|
| 649 |
+
|
| 650 |
+
## Best practices
|
| 651 |
+
|
| 652 |
+
* **Function and Parameter Descriptions:** Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments.
|
| 653 |
+
* **Naming:** Use descriptive function names (without spaces, periods, or dashes).
|
| 654 |
+
* **Strong Typing:** Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid values, use an enum (see the sketch after this list).
|
| 655 |
+
* **Tool Selection:** While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools.
|
| 656 |
+
* **Prompt Engineering:**
|
| 657 |
+
* Provide context: Tell the model its role (e.g., "You are a helpful weather assistant.").
|
| 658 |
+
* Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts.").
|
| 659 |
+
* Encourage clarification: Instruct the model to ask clarifying questions if needed.
|
| 660 |
+
* **Temperature:** Use a low temperature (e.g., 0) for more deterministic and reliable function calls.
|
| 661 |
+
* **Validation:** If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it.
|
| 662 |
+
* **Error Handling** : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user.
|
| 663 |
+
* **Security:** Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls.
|
| 664 |
+
* **Token Limits:** Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, or break down complex tasks into smaller, more focused function sets.
|
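As an example of the strong typing guidance, a declaration for a hypothetical `set_fan_speed` function can constrain its parameter to an enum:

set_fan_speed = {
    "name": "set_fan_speed",
    "description": "Sets the ceiling fan to one of its supported speeds.",
    "parameters": {
        "type": "object",
        "properties": {
            "speed": {
                "type": "string",
                "enum": ["off", "low", "medium", "high"],
                "description": "The speed to set the fan to.",
            },
        },
        "required": ["speed"],
    },
}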
| 665 |
+
|
| 666 |
+
|
| 667 |
+
|
| 668 |
+
## Notes and limitations
|
| 669 |
+
|
| 670 |
+
* Only a [subset of the OpenAPI schema](https://ai.google.dev/api/caching#FunctionDeclaration) is supported.
|
| 671 |
+
* Supported parameter types in Python are limited.
|
| 672 |
+
* Automatic function calling is a Python SDK feature only.
|
| 673 |
+
|
| 674 |
+
|
documentation_gemini/gemini_thinking.md
ADDED
|
@@ -0,0 +1,260 @@
|
| 1 |
+
# Gemini thinking
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/thinking>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
The [Gemini 2.5 series models](/gemini-api/docs/models) use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis.
|
| 8 |
+
|
| 9 |
+
This guide shows you how to work with Gemini's thinking capabilities using the Gemini API.
|
| 10 |
+
|
| 11 |
+
## Before you begin
|
| 12 |
+
|
| 13 |
+
Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API:
|
| 14 |
+
|
| 15 |
+
* [Try Gemini 2.5 Flash in AI Studio](https://aistudio.google.com/prompts/new_chat?model=gemini-2.5-flash)
|
| 16 |
+
* [Try Gemini 2.5 Pro in AI Studio](https://aistudio.google.com/prompts/new_chat?model=gemini-2.5-pro)
|
| 17 |
+
* [Try Gemini 2.5 Flash-Lite in AI Studio](https://aistudio.google.com/prompts/new_chat?model=gemini-2.5-flash-lite)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
## Generating content with thinking
|
| 22 |
+
|
| 23 |
+
Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the `model` field, as demonstrated in the following [text generation](/gemini-api/docs/text-generation#text-input) example:
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
from google import genai
|
| 27 |
+
|
| 28 |
+
client = genai.Client()
|
| 29 |
+
prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example."
|
| 30 |
+
response = client.models.generate_content(
|
| 31 |
+
model="gemini-2.5-pro",
|
| 32 |
+
contents=prompt
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
print(response.text)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
## Thinking budgets
|
| 39 |
+
|
| 40 |
+
The `thinkingBudget` parameter guides the model on the number of thinking tokens to use when generating a response. A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks. If latency is more important, use a lower budget or disable thinking by setting `thinkingBudget` to 0. Setting the `thinkingBudget` to -1 turns on **dynamic thinking** , meaning the model will adjust the budget based on the complexity of the request.
|
| 41 |
+
|
| 42 |
+
The `thinkingBudget` is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget.
|
| 43 |
+
|
| 44 |
+
The following are `thinkingBudget` configuration details for each model type.
|
| 45 |
+
|
| 46 |
+
Model | Default setting (Thinking budget is not set) | Range | Disable thinking | Turn on dynamic thinking
|
| 47 |
+
---|---|---|---|---
|
| 48 |
+
**2.5 Pro** | Dynamic thinking: Model decides when and how much to think | `128` to `32768` | N/A: Cannot disable thinking | `thinkingBudget = -1`
|
| 49 |
+
**2.5 Flash** | Dynamic thinking: Model decides when and how much to think | `0` to `24576` | `thinkingBudget = 0` | `thinkingBudget = -1`
|
| 50 |
+
**2.5 Flash Lite** | Model does not think | `512` to `24576` | `thinkingBudget = 0` | `thinkingBudget = -1`
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
from google import genai
|
| 55 |
+
from google.genai import types
|
| 56 |
+
|
| 57 |
+
client = genai.Client()
|
| 58 |
+
|
| 59 |
+
response = client.models.generate_content(
|
| 60 |
+
model="gemini-2.5-pro",
|
| 61 |
+
contents="Provide a list of 3 famous physicists and their key contributions",
|
| 62 |
+
config=types.GenerateContentConfig(
|
| 63 |
+
thinking_config=types.ThinkingConfig(thinking_budget=1024)
|
| 64 |
+
# Turn off thinking:
|
| 65 |
+
# thinking_config=types.ThinkingConfig(thinking_budget=0)
|
| 66 |
+
# Turn on dynamic thinking:
|
| 67 |
+
# thinking_config=types.ThinkingConfig(thinking_budget=-1)
|
| 68 |
+
),
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
print(response.text)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
## Thought summaries
|
| 75 |
+
|
| 76 |
+
Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. Note that thinking budgets apply to the model's raw thoughts and not to thought summaries.
|
| 77 |
+
|
| 78 |
+
You can enable thought summaries by setting `includeThoughts` to `true` in your request configuration. You can then access the summary by iterating through the `response` parameter's `parts`, and checking the `thought` boolean.
|
| 79 |
+
|
| 80 |
+
Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response:
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
from google import genai
|
| 84 |
+
from google.genai import types
|
| 85 |
+
|
| 86 |
+
client = genai.Client()
|
| 87 |
+
prompt = "What is the sum of the first 50 prime numbers?"
|
| 88 |
+
response = client.models.generate_content(
|
| 89 |
+
model="gemini-2.5-pro",
|
| 90 |
+
contents=prompt,
|
| 91 |
+
config=types.GenerateContentConfig(
|
| 92 |
+
thinking_config=types.ThinkingConfig(
|
| 93 |
+
include_thoughts=True
|
| 94 |
+
)
|
| 95 |
+
)
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
for part in response.candidates[0].content.parts:
|
| 99 |
+
if not part.text:
|
| 100 |
+
continue
|
| 101 |
+
if part.thought:
|
| 102 |
+
print("Thought summary:")
|
| 103 |
+
print(part.text)
|
| 104 |
+
print()
|
| 105 |
+
else:
|
| 106 |
+
print("Answer:")
|
| 107 |
+
print(part.text)
|
| 108 |
+
print()
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation:
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
from google import genai
|
| 115 |
+
from google.genai import types
|
| 116 |
+
|
| 117 |
+
client = genai.Client()
|
| 118 |
+
|
| 119 |
+
prompt = """
|
| 120 |
+
Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue.
|
| 121 |
+
The person who lives in the red house owns a cat.
|
| 122 |
+
Bob does not live in the green house.
|
| 123 |
+
Carol owns a dog.
|
| 124 |
+
The green house is to the left of the red house.
|
| 125 |
+
Alice does not own a cat.
|
| 126 |
+
Who lives in each house, and what pet do they own?
|
| 127 |
+
"""
|
| 128 |
+
|
| 129 |
+
thoughts = ""
|
| 130 |
+
answer = ""
|
| 131 |
+
|
| 132 |
+
for chunk in client.models.generate_content_stream(
|
| 133 |
+
model="gemini-2.5-pro",
|
| 134 |
+
contents=prompt,
|
| 135 |
+
config=types.GenerateContentConfig(
|
| 136 |
+
thinking_config=types.ThinkingConfig(
|
| 137 |
+
include_thoughts=True
|
| 138 |
+
)
|
| 139 |
+
)
|
| 140 |
+
):
|
| 141 |
+
for part in chunk.candidates[0].content.parts:
|
| 142 |
+
if not part.text:
|
| 143 |
+
continue
|
| 144 |
+
elif part.thought:
|
| 145 |
+
if not thoughts:
|
| 146 |
+
print("Thoughts summary:")
|
| 147 |
+
print(part.text)
|
| 148 |
+
thoughts += part.text
|
| 149 |
+
else:
|
| 150 |
+
if not answer:
|
| 151 |
+
print("Answer:")
|
| 152 |
+
print(part.text)
|
| 153 |
+
answer += part.text
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
## Thought signatures
|
| 157 |
+
|
| 158 |
+
Because standard Gemini API text and content generation calls are stateless, when using thinking in multi-turn interactions (such as chat), the model doesn't have access to thought context from previous turns.
|
| 159 |
+
|
| 160 |
+
You can maintain thought context using thought signatures, which are encrypted representations of the model's internal thought process. The model returns thought signatures in the response object when thinking and [function calling](/gemini-api/docs/function-calling#thinking) are enabled. To ensure the model maintains context across multiple turns of a conversation, you must provide the thought signatures back to the model in the subsequent requests.
|
| 161 |
+
|
| 162 |
+
You will receive thought signatures when:
|
| 163 |
+
|
| 164 |
+
* Thinking is enabled and thoughts are generated.
|
| 165 |
+
* The request includes [function declarations](/gemini-api/docs/function-calling#step-2).
|
| 166 |
+
|
| 167 |
+
**Note:** Thought signatures are only available when you're using function calling; specifically, your request must include [function declarations](/gemini-api/docs/function-calling#step-2).
|
| 168 |
+
|
| 169 |
+
You can find an example of thinking with function calls on the [Function calling](/gemini-api/docs/function-calling#thinking) page.
|
| 170 |
+
|
| 171 |
+
Other usage limitations to consider with function calling include:
|
| 172 |
+
|
| 173 |
+
* Signatures are returned from the model within other parts in the response, for example function calling or text parts. Return the entire response with all parts back to the model in subsequent turns.
|
| 174 |
+
* Don't concatenate parts with signatures together.
|
| 175 |
+
* Don't merge one part with a signature with another part without a signature.
|
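One way to follow this guidance is to keep a running history and append the model's returned content unchanged, with every part (and its signature) intact, before the next call. This is a minimal sketch; the `config` with thinking and function declarations is assumed to be defined as in the function calling guide.

history = [types.Content(role="user", parts=[types.Part(text="Book a table for two at 7pm.")])]

response = client.models.generate_content(
    model="gemini-2.5-pro",
    contents=history,
    config=config,  # assumed: thinking enabled and function declarations included
)

# Append the entire returned content. Parts that carry thought signatures must not
# be split apart, merged, or concatenated with other parts.
history.append(response.candidates[0].content)

# ...append your function response parts or the next user turn, then call
# generate_content again with the full `history`.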
| 176 |
+
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
## Pricing
|
| 180 |
+
|
| 181 |
+
**Note:** **Summaries** are available in the [free and paid tiers](/gemini-api/docs/pricing) of the API. **Thought signatures** will increase the input tokens you are charged when sent back as part of the request.
|
| 182 |
+
|
| 183 |
+
When thinking is turned on, response pricing is the sum of output tokens and thinking tokens. You can get the total number of generated thinking tokens from the `thoughtsTokenCount` field.
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# ...
|
| 187 |
+
print("Thoughts tokens:",response.usage_metadata.thoughts_token_count)
|
| 188 |
+
print("Output tokens:",response.usage_metadata.candidates_token_count)
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API.
|
| 192 |
+
|
| 193 |
+
You can learn more about tokens in the [Token counting](/gemini-api/docs/tokens) guide.
|
| 194 |
+
|
| 195 |
+
## Supported models
|
| 196 |
+
|
| 197 |
+
Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the [model overview](/gemini-api/docs/models) page.
|
| 198 |
+
|
| 199 |
+
## Best practices
|
| 200 |
+
|
| 201 |
+
This section includes some guidance for using thinking models efficiently. As always, following our [prompting guidance and best practices](/gemini-api/docs/prompting-strategies) will get you the best results.
|
| 202 |
+
|
| 203 |
+
### Debugging and steering
|
| 204 |
+
|
| 205 |
+
* **Review reasoning** : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct towards the right results.
|
| 206 |
+
|
| 207 |
+
* **Provide Guidance in Reasoning** : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response.
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
### Task complexity
|
| 213 |
+
|
| 214 |
+
* **Easy Tasks (Thinking could be OFF):** For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include:
|
| 215 |
+
* "Where was DeepMind founded?"
|
| 216 |
+
* "Is this email asking for a meeting or just providing information?"
|
| 217 |
+
* **Medium Tasks (Default/Some Thinking):** Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like:
|
| 218 |
+
* Analogize photosynthesis and growing up.
|
| 219 |
+
* Compare and contrast electric cars and hybrid cars.
|
| 220 |
+
* **Hard Tasks (Maximum Thinking Capability):** For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. Examples include:
|
| 221 |
+
* Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17_b is a divisor of 97_b.
|
| 222 |
+
* Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible.
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
## Thinking with tools and capabilities
|
| 227 |
+
|
| 228 |
+
Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response.
|
| 229 |
+
|
| 230 |
+
* The [search tool](/gemini-api/docs/grounding) allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics.
|
| 231 |
+
|
| 232 |
+
* The [code execution tool](/gemini-api/docs/code-execution) enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response.
|
| 233 |
+
|
| 234 |
+
* With [structured output](/gemini-api/docs/structured-output), you can constrain Gemini to respond with JSON. This is particularly useful for integrating the model's output into applications.
|
| 235 |
+
|
| 236 |
+
* [Function calling](/gemini-api/docs/function-calling) connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide.
|
| 237 |
+
|
| 238 |
+
* [URL Context](/gemini-api/docs/url-context) provides the model with URLs as additional context for your prompt. The model can then retrieve content from the URLs and use that content to inform and shape its response.
|
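For example, a single request can combine a thinking budget with the Google Search tool (a brief sketch reusing the configuration options shown earlier in this guide):

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Summarize this week's most significant AI research announcements.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(google_search=types.GoogleSearch())],
        thinking_config=types.ThinkingConfig(thinking_budget=-1, include_thoughts=True),
    ),
)

print(response.text)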
| 239 |
+
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
You can try examples of using tools with thinking models in the [Thinking cookbook](https://colab.sandbox.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Get_started_thinking.ipynb).
|
| 244 |
+
|
| 245 |
+
## What's next?
|
| 246 |
+
|
| 247 |
+
* To work through more in-depth examples, like:
|
| 248 |
+
|
| 249 |
+
* Using tools with thinking
|
| 250 |
+
* Streaming with thinking
|
| 251 |
+
* Adjusting the thinking budget for different results
|
| 252 |
+
|
| 253 |
+
and more, try our [Thinking cookbook](https://colab.sandbox.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Get_started_thinking.ipynb).
|
| 254 |
+
|
| 255 |
+
* Thinking coverage is now available in our [OpenAI Compatibility](/gemini-api/docs/openai#thinking) guide.
|
| 256 |
+
|
| 257 |
+
* For more info about Gemini 2.5 Pro, Gemini 2.5 Flash, and Gemini 2.5 Flash-Lite, visit the [model page](/gemini-api/docs/models).
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
|
documentation_gemini/grounding_with_google_search.md
ADDED
|
@@ -0,0 +1,215 @@
|
| 1 |
+
# Grounding with Google Search
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/google-search>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
Grounding with Google Search connects the Gemini model to real-time web content and works with all [available languages](/gemini-api/docs/models/gemini#available-languages). This allows Gemini to provide more accurate answers and cite verifiable sources beyond its knowledge cutoff.
|
| 8 |
+
|
| 9 |
+
Grounding helps you build applications that can:
|
| 10 |
+
|
| 11 |
+
* **Increase factual accuracy:** Reduce model hallucinations by basing responses on real-world information.
|
| 12 |
+
* **Access real-time information:** Answer questions about recent events and topics.
|
| 13 |
+
* **Provide citations:** Build user trust by showing the sources for the model's claims.
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
from google import genai
|
| 20 |
+
from google.genai import types
|
| 21 |
+
|
| 22 |
+
# Configure the client
|
| 23 |
+
client = genai.Client()
|
| 24 |
+
|
| 25 |
+
# Define the grounding tool
|
| 26 |
+
grounding_tool = types.Tool(
|
| 27 |
+
google_search=types.GoogleSearch()
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# Configure generation settings
|
| 31 |
+
config = types.GenerateContentConfig(
|
| 32 |
+
tools=[grounding_tool]
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
# Make the request
|
| 36 |
+
response = client.models.generate_content(
|
| 37 |
+
model="gemini-2.5-flash",
|
| 38 |
+
contents="Who won the euro 2024?",
|
| 39 |
+
config=config,
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
# Print the grounded response
|
| 43 |
+
print(response.text)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
You can learn more by trying the [Search tool notebook](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Search_Grounding.ipynb).
|
| 47 |
+
|
| 48 |
+
## How grounding with Google Search works
|
| 49 |
+
|
| 50 |
+
When you enable the `google_search` tool, the model handles the entire workflow of searching, processing, and citing information automatically.
|
| 51 |
+
|
| 52 |
+

|
| 53 |
+
|
| 54 |
+
1. **User Prompt:** Your application sends a user's prompt to the Gemini API with the `google_search` tool enabled.
|
| 55 |
+
2. **Prompt Analysis:** The model analyzes the prompt and determines if a Google Search can improve the answer.
|
| 56 |
+
3. **Google Search:** If needed, the model automatically generates one or multiple search queries and executes them.
|
| 57 |
+
4. **Search Results Processing:** The model processes the search results, synthesizes the information, and formulates a response.
|
| 58 |
+
5. **Grounded Response:** The API returns a final, user-friendly response that is grounded in the search results. This response includes the model's text answer and `groundingMetadata` with the search queries, web results, and citations.
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
## Understanding the Grounding Response
|
| 63 |
+
|
| 64 |
+
When a response is successfully grounded, the response includes a `groundingMetadata` field. This structured data is essential for verifying claims and building a rich citation experience in your application.
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
{
|
| 68 |
+
"candidates": [
|
| 69 |
+
{
|
| 70 |
+
"content": {
|
| 71 |
+
"parts": [
|
| 72 |
+
{
|
| 73 |
+
"text": "Spain won Euro 2024, defeating England 2-1 in the final. This victory marks Spain's record fourth European Championship title."
|
| 74 |
+
}
|
| 75 |
+
],
|
| 76 |
+
"role": "model"
|
| 77 |
+
},
|
| 78 |
+
"groundingMetadata": {
|
| 79 |
+
"webSearchQueries": [
|
| 80 |
+
"UEFA Euro 2024 winner",
|
| 81 |
+
"who won euro 2024"
|
| 82 |
+
],
|
| 83 |
+
"searchEntryPoint": {
|
| 84 |
+
"renderedContent": "<!-- HTML and CSS for the search widget -->"
|
| 85 |
+
},
|
| 86 |
+
"groundingChunks": [
|
| 87 |
+
{"web": {"uri": "https://vertexaisearch.cloud.google.com.....", "title": "aljazeera.com"}},
|
| 88 |
+
{"web": {"uri": "https://vertexaisearch.cloud.google.com.....", "title": "uefa.com"}}
|
| 89 |
+
],
|
| 90 |
+
"groundingSupports": [
|
| 91 |
+
{
|
| 92 |
+
"segment": {"startIndex": 0, "endIndex": 85, "text": "Spain won Euro 2024, defeatin..."},
|
| 93 |
+
"groundingChunkIndices": [0]
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"segment": {"startIndex": 86, "endIndex": 210, "text": "This victory marks Spain's..."},
|
| 97 |
+
"groundingChunkIndices": [0, 1]
|
| 98 |
+
}
|
| 99 |
+
]
|
| 100 |
+
}
|
| 101 |
+
}
|
| 102 |
+
]
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
The Gemini API returns the following information with the `groundingMetadata`:
|
| 107 |
+
|
| 108 |
+
* `webSearchQueries` : Array of the search queries used. This is useful for debugging and understanding the model's reasoning process.
|
| 109 |
+
* `searchEntryPoint` : Contains the HTML and CSS to render the required Search Suggestions. Full usage requirements are detailed in the [Terms of Service](/gemini-api/terms#grounding-with-google-search).
|
| 110 |
+
* `groundingChunks` : Array of objects containing the web sources (`uri` and `title`).
|
| 111 |
+
* `groundingSupports` : Array of chunks to connect model response `text` to the sources in `groundingChunks`. Each chunk links a text `segment` (defined by `startIndex` and `endIndex`) to one or more `groundingChunkIndices`. This is the key to building inline citations.
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
Grounding with Google Search can also be used in combination with the [URL context tool](/gemini-api/docs/url-context) to ground responses in both public web data and the specific URLs you provide.
|
| 116 |
+
|
| 117 |
+
## Attributing Sources with inline Citations
|
| 118 |
+
|
| 119 |
+
The API returns structured citation data, giving you complete control over how you display sources in your user interface. You can use the `groundingSupports` and `groundingChunks` fields to link the model's statements directly to their sources. Here is a common pattern for processing the metadata to create a response with inline, clickable citations.
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def add_citations(response):
|
| 123 |
+
text = response.text
|
| 124 |
+
supports = response.candidates[0].grounding_metadata.grounding_supports
|
| 125 |
+
chunks = response.candidates[0].grounding_metadata.grounding_chunks
|
| 126 |
+
|
| 127 |
+
# Sort supports by end_index in descending order to avoid shifting issues when inserting.
|
| 128 |
+
sorted_supports = sorted(supports, key=lambda s: s.segment.end_index, reverse=True)
|
| 129 |
+
|
| 130 |
+
for support in sorted_supports:
|
| 131 |
+
end_index = support.segment.end_index
|
| 132 |
+
if support.grounding_chunk_indices:
|
| 133 |
+
# Create citation string like [1](link1)[2](link2)
|
| 134 |
+
citation_links = []
|
| 135 |
+
for i in support.grounding_chunk_indices:
|
| 136 |
+
if i < len(chunks):
|
| 137 |
+
uri = chunks[i].web.uri
|
| 138 |
+
citation_links.append(f"[{i + 1}]({uri})")
|
| 139 |
+
|
| 140 |
+
citation_string = ", ".join(citation_links)
|
| 141 |
+
text = text[:end_index] + citation_string + text[end_index:]
|
| 142 |
+
|
| 143 |
+
return text
|
| 144 |
+
|
| 145 |
+
# Assuming response with grounding metadata
|
| 146 |
+
text_with_citations = add_citations(response)
|
| 147 |
+
print(text_with_citations)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
Spain won Euro 2024, defeating England 2-1 in the final.[1](https:/...), [2](https:/...), [4](https:/...), [5](https:/...) This victory marks Spain's record-breaking fourth European Championship title.[5](https:/...), [2](https:/...), [3](https:/...), [4](https:/...)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
## Pricing
|
| 155 |
+
|
| 156 |
+
When you use Grounding with Google Search, your project is billed per API request that includes the `google_search` tool. If the model decides to execute multiple search queries to answer a single prompt (for example, searching for `"UEFA Euro 2024 winner"` and `"Spain vs England Euro 2024 final score"` within the same API call), this counts as a single billable use of the tool for that request.
|
| 157 |
+
|
| 158 |
+
For detailed pricing information, see the [Gemini API pricing page](https://ai.google.dev/gemini-api/docs/pricing).
|
| 159 |
+
|
| 160 |
+
## Supported Models
|
| 161 |
+
|
| 162 |
+
Experimental and Preview models are not included. You can find their capabilities on the [model overview](https://ai.google.dev/gemini-api/docs/models) page.
|
| 163 |
+
|
| 164 |
+
Model | Grounding with Google Search
|
| 165 |
+
---|---
|
| 166 |
+
Gemini 2.5 Pro | ✔️
|
| 167 |
+
Gemini 2.5 Flash | ✔️
|
| 168 |
+
Gemini 2.0 Flash | ✔️
|
| 169 |
+
Gemini 1.5 Pro | ✔️
|
| 170 |
+
Gemini 1.5 Flash | ✔️
|
| 171 |
+
**Note:** Older models use a `google_search_retrieval` tool. For all current models, use the `google_search` tool as shown in the examples.
|
| 172 |
+
|
| 173 |
+
## Grounding with Gemini 1.5 Models (Legacy)
|
| 174 |
+
|
| 175 |
+
While the `google_search` tool is recommended for Gemini 2.0 and later, Gemini 1.5 models support a legacy tool named `google_search_retrieval`. This tool provides a `dynamic` mode that allows the model to decide whether to perform a search based on its confidence that the prompt requires fresh information. If the model's confidence is above a `dynamic_threshold` you set (a value between 0.0 and 1.0), it will perform a search.
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# Note: This is a legacy approach for Gemini 1.5 models.
|
| 179 |
+
# The 'google_search' tool is recommended for all new development.
|
| 180 |
+
import os
|
| 181 |
+
from google import genai
|
| 182 |
+
from google.genai import types
|
| 183 |
+
|
| 184 |
+
client = genai.Client()
|
| 185 |
+
|
| 186 |
+
retrieval_tool = types.Tool(
|
| 187 |
+
google_search_retrieval=types.GoogleSearchRetrieval(
|
| 188 |
+
dynamic_retrieval_config=types.DynamicRetrievalConfig(
|
| 189 |
+
mode=types.DynamicRetrievalConfigMode.MODE_DYNAMIC,
|
| 190 |
+
dynamic_threshold=0.7 # Only search if confidence > 70%
|
| 191 |
+
)
|
| 192 |
+
)
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
config = types.GenerateContentConfig(
|
| 196 |
+
tools=[retrieval_tool]
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
response = client.models.generate_content(
|
| 200 |
+
model='gemini-1.5-flash',
|
| 201 |
+
contents="Who won the euro 2024?",
|
| 202 |
+
config=config,
|
| 203 |
+
)
|
| 204 |
+
print(response.text)
|
| 205 |
+
if not response.candidates[0].grounding_metadata:
|
| 206 |
+
print("\nModel answered from its own knowledge.")
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
## What's next
|
| 210 |
+
|
| 211 |
+
* Try the [Grounding with Google Search in the Gemini API Cookbook](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Search_Grounding.ipynb).
|
| 212 |
+
* Learn about other available tools, like [Function Calling](/gemini-api/docs/function-calling).
|
| 213 |
+
* Learn how to augment prompts with specific URLs using the [URL context tool](/gemini-api/docs/url-context).
|
| 214 |
+
|
| 215 |
+
|
documentation_gemini/image_generation_with_gemini.md
ADDED
|
@@ -0,0 +1,137 @@
|
| 1 |
+
# Image generation with Gemini
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/image-generation>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
Gemini can generate and process images conversationally. You can prompt Gemini with text, images, or a combination of both to achieve various image-related tasks, such as image generation and editing. All generated images include a [SynthID watermark](/responsible/docs/safeguards/synthid).
|
| 8 |
+
|
| 9 |
+
Image generation may not be available in all regions and countries. Review our [Gemini models](/gemini-api/docs/models#gemini-2.0-flash-preview-image-generation) page for more information.
|
| 10 |
+
|
| 11 |
+
**Note:** You can also generate images with [Imagen](/gemini-api/docs/imagen), our specialized image generation model. See the When to use Imagen section for details on how to choose between Gemini and Imagen.
|
| 12 |
+
|
| 13 |
+
## Image generation (text-to-image)
|
| 14 |
+
|
| 15 |
+
The following code demonstrates how to generate an image based on a descriptive prompt. You must include `responseModalities`: `["TEXT", "IMAGE"]` in your configuration. Image-only output is not supported with these models.
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
from google import genai
|
| 19 |
+
from google.genai import types
|
| 20 |
+
from PIL import Image
|
| 21 |
+
from io import BytesIO
|
| 22 |
+
import base64
|
| 23 |
+
|
| 24 |
+
client = genai.Client()
|
| 25 |
+
|
| 26 |
+
contents = ('Hi, can you create a 3d rendered image of a pig '
|
| 27 |
+
'with wings and a top hat flying over a happy '
|
| 28 |
+
'futuristic scifi city with lots of greenery?')
|
| 29 |
+
|
| 30 |
+
response = client.models.generate_content(
|
| 31 |
+
model="gemini-2.0-flash-preview-image-generation",
|
| 32 |
+
contents=contents,
|
| 33 |
+
config=types.GenerateContentConfig(
|
| 34 |
+
response_modalities=['TEXT', 'IMAGE']
|
| 35 |
+
)
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
for part in response.candidates[0].content.parts:
|
| 39 |
+
if part.text is not None:
|
| 40 |
+
print(part.text)
|
| 41 |
+
elif part.inline_data is not None:
|
| 42 |
+
image = Image.open(BytesIO((part.inline_data.data)))
|
| 43 |
+
image.save('gemini-native-image.png')
|
| 44 |
+
image.show()
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
 AI-generated image of a fantastical flying pig
|
| 48 |
+
|
| 49 |
+
## Image editing (text-and-image-to-image)
|
| 50 |
+
|
| 51 |
+
To perform image editing, add an image as input. The following example demonstrates uploading base64 encoded images. For multiple images and larger payloads, check the [image input](/gemini-api/docs/image-understanding#image-input) section.
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
from google import genai
|
| 55 |
+
from google.genai import types
|
| 56 |
+
from PIL import Image
|
| 57 |
+
from io import BytesIO
|
| 58 |
+
|
| 59 |
+
import PIL.Image
|
| 60 |
+
|
| 61 |
+
image = PIL.Image.open('/path/to/image.png')
|
| 62 |
+
|
| 63 |
+
client = genai.Client()
|
| 64 |
+
|
| 65 |
+
text_input = ('Hi, this is a picture of me. '
|
| 66 |
+
'Can you add a llama next to me?')
|
| 67 |
+
|
| 68 |
+
response = client.models.generate_content(
|
| 69 |
+
model="gemini-2.0-flash-preview-image-generation",
|
| 70 |
+
contents=[text_input, image],
|
| 71 |
+
config=types.GenerateContentConfig(
|
| 72 |
+
response_modalities=['TEXT', 'IMAGE']
|
| 73 |
+
)
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
for part in response.candidates[0].content.parts:
|
| 77 |
+
if part.text is not None:
|
| 78 |
+
print(part.text)
|
| 79 |
+
elif part.inline_data is not None:
|
| 80 |
+
image = Image.open(BytesIO((part.inline_data.data)))
|
| 81 |
+
image.show()
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
## Other image generation modes
|
| 85 |
+
|
| 86 |
+
Gemini supports other image interaction modes based on prompt structure and context, including:
|
| 87 |
+
|
| 88 |
+
* **Text to image(s) and text (interleaved):** Outputs images with related text.
|
| 89 |
+
* Example prompt: "Generate an illustrated recipe for a paella."
|
| 90 |
+
* **Image(s) and text to image(s) and text (interleaved)** : Uses input images and text to create new related images and text.
|
| 91 |
+
* Example prompt: (With an image of a furnished room) "What other color sofas would work in my space? can you update the image?"
|
| 92 |
+
* **Multi-turn image editing (chat):** Keep generating / editing images conversationally.
|
| 93 |
+
* Example prompts: [upload an image of a blue car.] , "Turn this car into a convertible.", "Now change the color to yellow."
|
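For the multi-turn image editing mode above, one possible shape is a chat session with image output enabled. This is a sketch under the assumption that the image-generation preview model accepts chat requests and PIL images as inputs; file paths and names are illustrative.

from io import BytesIO
from PIL import Image
from google import genai
from google.genai import types

client = genai.Client()

chat = client.chats.create(
    model="gemini-2.0-flash-preview-image-generation",
    config=types.GenerateContentConfig(response_modalities=['TEXT', 'IMAGE']),
)

car_image = Image.open('/path/to/blue-car.png')
first = chat.send_message(["Turn this car into a convertible.", car_image])
second = chat.send_message("Now change the color to yellow.")

# Save the latest edited image from the second turn.
for part in second.candidates[0].content.parts:
    if part.inline_data is not None:
        Image.open(BytesIO(part.inline_data.data)).save('yellow-convertible.png')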
| 94 |
+
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
## Limitations
|
| 98 |
+
|
| 99 |
+
* For best performance, use the following languages: EN, es-MX, ja-JP, zh-CN, hi-IN.
|
| 100 |
+
* Image generation does not support audio or video inputs.
|
| 101 |
+
* Image generation may not always trigger:
|
| 102 |
+
* The model may output text only. Try asking for image outputs explicitly (e.g. "generate an image", "provide images as you go along", "update the image").
|
| 103 |
+
* The model may stop generating partway through. Try again or try a different prompt.
|
| 104 |
+
* When generating text for an image, Gemini works best if you first generate the text and then ask for an image with the text.
|
| 105 |
+
* There are some regions/countries where Image generation is not available. See [Models](/gemini-api/docs/models) for more information.
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
## When to use Imagen
|
| 110 |
+
|
| 111 |
+
In addition to using Gemini's built-in image generation capabilities, you can also access [Imagen](/gemini-api/docs/imagen), our specialized image generation model, through the Gemini API.
|
| 112 |
+
|
| 113 |
+
Choose **Gemini** when:
|
| 114 |
+
|
| 115 |
+
* You need contextually relevant images that leverage world knowledge and reasoning.
|
| 116 |
+
* Seamlessly blending text and images is important.
|
| 117 |
+
* You want accurate visuals embedded within long text sequences.
|
| 118 |
+
* You want to edit images conversationally while maintaining context.
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
Choose **Imagen** when:
|
| 123 |
+
|
| 124 |
+
* Image quality, photorealism, artistic detail, or specific styles (e.g., impressionism, anime) are top priorities.
|
| 125 |
+
* Performing specialized editing tasks like product background updates or image upscaling.
|
| 126 |
+
* Infusing branding, style, or generating logos and product designs.
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
Imagen 4 should be your go-to model when starting to generate images with Imagen. Choose Imagen 4 Ultra for advanced use-cases or when you need the best image quality. Note that Imagen 4 Ultra can only generate one image at a time.
|
| 131 |
+
|
| 132 |
+
## What's next
|
| 133 |
+
|
| 134 |
+
* Check out the [Veo guide](/gemini-api/docs/video) to learn how to generate videos with the Gemini API.
|
| 135 |
+
* To learn more about Gemini models, see [Gemini models](/gemini-api/docs/models/gemini) and [Experimental models](/gemini-api/docs/models/experimental-models).
|
| 136 |
+
|
| 137 |
+
|
documentation_gemini/image_understanding.md
ADDED
|
@@ -0,0 +1,352 @@
|
| 1 |
+
# Image understanding
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/image-understanding>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models.
|
| 8 |
+
|
| 9 |
+
**Tip:** In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer **improved accuracy** for specific use cases like object detection and segmentation, through additional training. See the Capabilities section for more details.
|
| 10 |
+
|
| 11 |
+
## Passing images to Gemini
|
| 12 |
+
|
| 13 |
+
You can provide images as input to Gemini using two methods:
|
| 14 |
+
|
| 15 |
+
* Passing inline image data: Ideal for smaller files (total request size less than 20MB, including prompts).
|
| 16 |
+
* Uploading images using the File API: Recommended for larger files or for reusing images across multiple requests.
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
### Passing inline image data
|
| 21 |
+
|
| 22 |
+
You can pass inline image data in the request to `generateContent`. You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language).
|
| 23 |
+
|
| 24 |
+
The following example shows how to read an image from a local file and pass it to `generateContent` API for processing.
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
from google.genai import types
|
| 28 |
+
|
| 29 |
+
with open('path/to/small-sample.jpg', 'rb') as f:
|
| 30 |
+
image_bytes = f.read()
|
| 31 |
+
|
| 32 |
+
response = client.models.generate_content(
|
| 33 |
+
model='gemini-2.5-flash',
|
| 34 |
+
contents=[
|
| 35 |
+
types.Part.from_bytes(
|
| 36 |
+
data=image_bytes,
|
| 37 |
+
mime_type='image/jpeg',
|
| 38 |
+
),
|
| 39 |
+
'Caption this image.'
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
print(response.text)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
You can also fetch an image from a URL, convert it to bytes, and pass it to `generateContent` as shown in the following examples.
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
from google import genai
|
| 50 |
+
from google.genai import types
|
| 51 |
+
|
| 52 |
+
import requests
|
| 53 |
+
|
| 54 |
+
image_path = "https://goo.gle/instrument-img"
|
| 55 |
+
image_bytes = requests.get(image_path).content
|
| 56 |
+
image = types.Part.from_bytes(
|
| 57 |
+
data=image_bytes, mime_type="image/jpeg"
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
client = genai.Client()
|
| 61 |
+
|
| 62 |
+
response = client.models.generate_content(
|
| 63 |
+
model="gemini-2.5-flash",
|
| 64 |
+
contents=["What is this image?", image],
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
print(response.text)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
**Note:** Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. The Files API is also more efficient for scenarios that use the same image repeatedly.
|
| 71 |
+
|
| 72 |
+
### Uploading images using the File API
|
| 73 |
+
|
| 74 |
+
For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to `generateContent`. See the [Files API guide](/gemini-api/docs/files) for more information and examples.
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
from google import genai
|
| 78 |
+
|
| 79 |
+
client = genai.Client()
|
| 80 |
+
|
| 81 |
+
my_file = client.files.upload(file="path/to/sample.jpg")
|
| 82 |
+
|
| 83 |
+
response = client.models.generate_content(
|
| 84 |
+
model="gemini-2.5-flash",
|
| 85 |
+
contents=[my_file, "Caption this image."],
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
print(response.text)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
## Prompting with multiple images
|
| 92 |
+
|
| 93 |
+
You can provide multiple images in a single prompt by including multiple image `Part` objects in the `contents` array. These can be a mix of inline data (local files or URLs) and File API references.
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
from google import genai
|
| 97 |
+
from google.genai import types
|
| 98 |
+
|
| 99 |
+
client = genai.Client()
|
| 100 |
+
|
| 101 |
+
# Upload the first image
|
| 102 |
+
image1_path = "path/to/image1.jpg"
|
| 103 |
+
uploaded_file = client.files.upload(file=image1_path)
|
| 104 |
+
|
| 105 |
+
# Prepare the second image as inline data
|
| 106 |
+
image2_path = "path/to/image2.png"
|
| 107 |
+
with open(image2_path, 'rb') as f:
|
| 108 |
+
img2_bytes = f.read()
|
| 109 |
+
|
| 110 |
+
# Create the prompt with text and multiple images
|
| 111 |
+
response = client.models.generate_content(
|
| 112 |
+
|
| 113 |
+
model="gemini-2.5-flash",
|
| 114 |
+
contents=[
|
| 115 |
+
"What is different between these two images?",
|
| 116 |
+
uploaded_file, # Use the uploaded file reference
|
| 117 |
+
types.Part.from_bytes(
|
| 118 |
+
data=img2_bytes,
|
| 119 |
+
mime_type='image/png'
|
| 120 |
+
)
|
| 121 |
+
]
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
print(response.text)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
## Object detection
|
| 128 |
+
|
| 129 |
+
From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size.
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
from google import genai
|
| 133 |
+
from google.genai import types
|
| 134 |
+
from PIL import Image
|
| 135 |
+
import json
|
| 136 |
+
|
| 137 |
+
client = genai.Client()
|
| 138 |
+
prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000."
|
| 139 |
+
|
| 140 |
+
image = Image.open("/path/to/image.png")
|
| 141 |
+
|
| 142 |
+
config = types.GenerateContentConfig(
|
| 143 |
+
response_mime_type="application/json"
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
response = client.models.generate_content(model="gemini-2.5-flash",
|
| 147 |
+
contents=[image, prompt],
|
| 148 |
+
config=config
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
width, height = image.size
|
| 152 |
+
bounding_boxes = json.loads(response.text)
|
| 153 |
+
|
| 154 |
+
converted_bounding_boxes = []
|
| 155 |
+
for bounding_box in bounding_boxes:
|
| 156 |
+
abs_y1 = int(bounding_box["box_2d"][0]/1000 * height)
|
| 157 |
+
abs_x1 = int(bounding_box["box_2d"][1]/1000 * width)
|
| 158 |
+
abs_y2 = int(bounding_box["box_2d"][2]/1000 * height)
|
| 159 |
+
abs_x2 = int(bounding_box["box_2d"][3]/1000 * width)
|
| 160 |
+
converted_bounding_boxes.append([abs_x1, abs_y1, abs_x2, abs_y2])
|
| 161 |
+
|
| 162 |
+
print("Image size: ", width, height)
|
| 163 |
+
print("Bounding boxes:", converted_bounding_boxes)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
**Note:** The model also supports generating bounding boxes based on custom instructions, such as: "Show bounding boxes of all green objects in this image". It also supports custom labels, such as "label the items with the allergens they can contain".
|
| 168 |
+
|
| 169 |
+
For more examples, check following notebooks in the [Gemini Cookbook](https://github.com/google-gemini/cookbook):
|
| 170 |
+
|
| 171 |
+
* [2D spatial understanding notebook](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Spatial_understanding.ipynb)
|
| 172 |
+
* [Experimental 3D pointing notebook](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/examples/Spatial_understanding_3d.ipynb)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
## Segmentation
|
| 177 |
+
|
| 178 |
+
Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks.
|
| 179 |
+
|
| 180 |
+
The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box ("`box_2d`") in the format `[y0, x0, y1, x1]` with normalized coordinates between 0 and 1000, a label ("`label`") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint).
|
| 181 |
+
|
| 182 |
+
**Note:** For better results, disable [thinking](/gemini-api/docs/thinking) by setting the thinking budget to 0. See code sample below for an example.
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
from google import genai
|
| 186 |
+
from google.genai import types
|
| 187 |
+
from PIL import Image, ImageDraw
|
| 188 |
+
import io
|
| 189 |
+
import base64
|
| 190 |
+
import json
|
| 191 |
+
import numpy as np
|
| 192 |
+
import os
|
| 193 |
+
|
| 194 |
+
client = genai.Client()
|
| 195 |
+
|
| 196 |
+
def parse_json(json_output: str):
|
| 197 |
+
# Parsing out the markdown fencing
|
| 198 |
+
lines = json_output.splitlines()
|
| 199 |
+
for i, line in enumerate(lines):
|
| 200 |
+
if line == "```json":
|
| 201 |
+
json_output = "\n".join(lines[i+1:]) # Remove everything before "```json"
|
| 202 |
+
json_output = json_output.split("```")[0]  # Remove everything after the closing "```"
|
| 203 |
+
break # Exit the loop once "```json" is found
|
| 204 |
+
return json_output
|
| 205 |
+
|
| 206 |
+
def extract_segmentation_masks(image_path: str, output_dir: str = "segmentation_outputs"):
|
| 207 |
+
# Load and resize image
|
| 208 |
+
im = Image.open(image_path)
|
| 209 |
+
im.thumbnail([1024, 1024], Image.Resampling.LANCZOS)
|
| 210 |
+
|
| 211 |
+
prompt = """
|
| 212 |
+
Give the segmentation masks for the wooden and glass items.
|
| 213 |
+
Output a JSON list of segmentation masks where each entry contains the 2D
|
| 214 |
+
bounding box in the key "box_2d", the segmentation mask in key "mask", and
|
| 215 |
+
the text label in the key "label". Use descriptive labels.
|
| 216 |
+
"""
|
| 217 |
+
|
| 218 |
+
config = types.GenerateContentConfig(
|
| 219 |
+
thinking_config=types.ThinkingConfig(thinking_budget=0) # set thinking_budget to 0 for better results in object detection
|
| 220 |
+
)
|
| 221 |
+
|
| 222 |
+
response = client.models.generate_content(
|
| 223 |
+
model="gemini-2.5-flash",
|
| 224 |
+
contents=[prompt, im], # Pillow images can be directly passed as inputs (which will be converted by the SDK)
|
| 225 |
+
config=config
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
# Parse JSON response
|
| 229 |
+
items = json.loads(parse_json(response.text))
|
| 230 |
+
|
| 231 |
+
# Create output directory
|
| 232 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 233 |
+
|
| 234 |
+
# Process each mask
|
| 235 |
+
for i, item in enumerate(items):
|
| 236 |
+
# Get bounding box coordinates
|
| 237 |
+
box = item["box_2d"]
|
| 238 |
+
y0 = int(box[0] / 1000 * im.size[1])
|
| 239 |
+
x0 = int(box[1] / 1000 * im.size[0])
|
| 240 |
+
y1 = int(box[2] / 1000 * im.size[1])
|
| 241 |
+
x1 = int(box[3] / 1000 * im.size[0])
|
| 242 |
+
|
| 243 |
+
# Skip invalid boxes
|
| 244 |
+
if y0 >= y1 or x0 >= x1:
|
| 245 |
+
continue
|
| 246 |
+
|
| 247 |
+
# Process mask
|
| 248 |
+
png_str = item["mask"]
|
| 249 |
+
if not png_str.startswith("data:image/png;base64,"):
|
| 250 |
+
continue
|
| 251 |
+
|
| 252 |
+
# Remove prefix
|
| 253 |
+
png_str = png_str.removeprefix("data:image/png;base64,")
|
| 254 |
+
mask_data = base64.b64decode(png_str)
|
| 255 |
+
mask = Image.open(io.BytesIO(mask_data))
|
| 256 |
+
|
| 257 |
+
# Resize mask to match bounding box
|
| 258 |
+
mask = mask.resize((x1 - x0, y1 - y0), Image.Resampling.BILINEAR)
|
| 259 |
+
|
| 260 |
+
# Convert mask to numpy array for processing
|
| 261 |
+
mask_array = np.array(mask)
|
| 262 |
+
|
| 263 |
+
# Create overlay for this mask
|
| 264 |
+
overlay = Image.new('RGBA', im.size, (0, 0, 0, 0))
|
| 265 |
+
overlay_draw = ImageDraw.Draw(overlay)
|
| 266 |
+
|
| 267 |
+
# Create overlay for the mask
|
| 268 |
+
color = (255, 255, 255, 200)
|
| 269 |
+
for y in range(y0, y1):
|
| 270 |
+
for x in range(x0, x1):
|
| 271 |
+
if mask_array[y - y0, x - x0] > 128: # Threshold for mask
|
| 272 |
+
overlay_draw.point((x, y), fill=color)
|
| 273 |
+
|
| 274 |
+
# Save individual mask and its overlay
|
| 275 |
+
mask_filename = f"{item['label']}_{i}_mask.png"
|
| 276 |
+
overlay_filename = f"{item['label']}_{i}_overlay.png"
|
| 277 |
+
|
| 278 |
+
mask.save(os.path.join(output_dir, mask_filename))
|
| 279 |
+
|
| 280 |
+
# Create and save overlay
|
| 281 |
+
composite = Image.alpha_composite(im.convert('RGBA'), overlay)
|
| 282 |
+
composite.save(os.path.join(output_dir, overlay_filename))
|
| 283 |
+
print(f"Saved mask and overlay for {item['label']} to {output_dir}")
|
| 284 |
+
|
| 285 |
+
# Example usage
|
| 286 |
+
if __name__ == "__main__":
|
| 287 |
+
extract_segmentation_masks("path/to/image.png")
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
Check the [segmentation example](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Spatial_understanding.ipynb#scrollTo=WQJTJ8wdGOKx) in the cookbook guide for a more detailed example.
|
| 292 |
+
|
| 293 |
+
_An example segmentation output with objects and segmentation masks_
|
| 294 |
+
|
| 295 |
+
## Supported image formats
|
| 296 |
+
|
| 297 |
+
Gemini supports the following image format MIME types:
|
| 298 |
+
|
| 299 |
+
* PNG - `image/png`
|
| 300 |
+
* JPEG - `image/jpeg`
|
| 301 |
+
* WEBP - `image/webp`
|
| 302 |
+
* HEIC - `image/heic`
|
| 303 |
+
* HEIF - `image/heif`
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
## Capabilities
|
| 308 |
+
|
| 309 |
+
All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question answering, image classification, object detection and segmentation.
|
| 310 |
+
|
| 311 |
+
Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements.
|
| 312 |
+
|
| 313 |
+
Some later model versions are specifically trained to improve the accuracy of specialized tasks in addition to their generic capabilities:
|
| 314 |
+
|
| 315 |
+
* **Gemini 2.0 models** are further trained to support enhanced object detection.
|
| 316 |
+
|
| 317 |
+
* **Gemini 2.5 models** are further trained to support enhanced segmentation in addition to object detection.
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
## Limitations and key technical information
|
| 323 |
+
|
| 324 |
+
### File limit
|
| 325 |
+
|
| 326 |
+
Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request.
|
| 327 |
+
|
| 328 |
+
### Token calculation
|
| 329 |
+
|
| 330 |
+
* **Gemini 1.5 Flash and Gemini 1.5 Pro** : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens.
|
| 331 |
+
* **Gemini 2.0 Flash and Gemini 2.5 Flash/Pro** : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each costing 258 tokens.
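
Rather than estimating tile counts by hand, you can ask the API for the exact figure. The following is a small sketch using `count_tokens` (the file path is illustrative):

```python
from google import genai

client = genai.Client()

# Upload the image once, then ask how many tokens it occupies together
# with a short text prompt.
my_file = client.files.upload(file="path/to/sample.jpg")

token_info = client.models.count_tokens(
    model="gemini-2.5-flash",
    contents=[my_file, "Caption this image."],
)
print(token_info.total_tokens)
```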
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
## Tips and best practices
|
| 336 |
+
|
| 337 |
+
* Verify that images are correctly rotated.
|
| 338 |
+
* Use clear, non-blurry images.
|
| 339 |
+
* When using a single image with text, place the text prompt _after_ the image part in the `contents` array.
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
## What's next
|
| 344 |
+
|
| 345 |
+
This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources:
|
| 346 |
+
|
| 347 |
+
* [Files API](/gemini-api/docs/files): Learn more about uploading and managing files for use with Gemini.
|
| 348 |
+
* [System instructions](/gemini-api/docs/text-generation#system-instructions): System instructions let you steer the behavior of the model based on your specific needs and use cases.
|
| 349 |
+
* [File prompting strategies](/gemini-api/docs/files#prompt-guide): The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting.
|
| 350 |
+
* [Safety guidance](/gemini-api/docs/safety-guidance): Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs.
|
| 351 |
+
|
| 352 |
+
|
documentation_gemini/long_context.md
ADDED
|
@@ -0,0 +1,116 @@
|
| 1 |
+
# Long context
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/long-context>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
Many Gemini models come with large context windows of 1 million or more tokens. Historically, large language models (LLMs) were significantly limited by the amount of text (or tokens) that could be passed to the model at one time. The Gemini long context window unlocks many new use cases and developer paradigms.
|
| 8 |
+
|
| 9 |
+
The code you already use for cases like [text generation](/gemini-api/docs/text-generation) or [multimodal inputs](/gemini-api/docs/vision) will work without any changes with long context.
|
| 10 |
+
|
| 11 |
+
This document gives you an overview of what you can achieve using models with context windows of 1M and more tokens. The page gives a brief overview of a context window, and explores how developers should think about long context, various real world use cases for long context, and ways to optimize the usage of long context.
|
| 12 |
+
|
| 13 |
+
For the context window sizes of specific models, see the [Models](/gemini-api/docs/models) page.
|
| 14 |
+
|
| 15 |
+
## What is a context window?
|
| 16 |
+
|
| 17 |
+
The basic way you use the Gemini models is by passing information (context) to the model, which will subsequently generate a response. An analogy for the context window is short term memory. There is a limited amount of information that can be stored in someone's short term memory, and the same is true for generative models.
|
| 18 |
+
|
| 19 |
+
You can read more about how models work under the hood in our [generative models guide](/gemini-api/docs/prompting-strategies#under-the-hood).
|
| 20 |
+
|
| 21 |
+
## Getting started with long context
|
| 22 |
+
|
| 23 |
+
Earlier versions of generative models were only able to process 8,000 tokens at a time. Newer models pushed this further by accepting 32,000 or even 128,000 tokens. Gemini is the first model capable of accepting 1 million tokens.
|
| 24 |
+
|
| 25 |
+
In practice, 1 million tokens would look like:
|
| 26 |
+
|
| 27 |
+
* 50,000 lines of code (with the standard 80 characters per line)
|
| 28 |
+
* All the text messages you have sent in the last 5 years
|
| 29 |
+
* 8 average length English novels
|
| 30 |
+
* Transcripts of over 200 average length podcast episodes
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
The more limited context windows common in many other models often require strategies like arbitrarily dropping old messages, summarizing content, using RAG with vector databases, or filtering prompts to save tokens.
|
| 35 |
+
|
| 36 |
+
While these techniques remain valuable in specific scenarios, Gemini's extensive context window invites a more direct approach: providing all relevant information upfront. Because Gemini models were purpose-built with massive context capabilities, they demonstrate powerful in-context learning. For example, using only in-context instructional materials (a 500-page reference grammar, a dictionary, and ≈400 parallel sentences), Gemini [learned to translate](https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf) from English to Kalamang—a Papuan language with fewer than 200 speakers—with quality similar to a human learner using the same materials. This illustrates the paradigm shift enabled by Gemini's long context, empowering new possibilities through robust in-context learning.
|
| 37 |
+
|
| 38 |
+
## Long context use cases
|
| 39 |
+
|
| 40 |
+
While the standard use case for most generative models is still text input, the Gemini model family enables a new paradigm of multimodal use cases. These models can natively understand text, video, audio, and images. They are accompanied by the [Gemini API that takes in multimodal file types](/gemini-api/docs/prompting_with_media) for convenience.
|
| 41 |
+
|
| 42 |
+
### Long form text
|
| 43 |
+
|
| 44 |
+
Text has proved to be the layer of intelligence underpinning much of the momentum around LLMs. As mentioned earlier, much of the practical limitation of LLMs was because of not having a large enough context window to do certain tasks. This led to the rapid adoption of retrieval augmented generation (RAG) and other techniques which dynamically provide the model with relevant contextual information. Now, with larger and larger context windows, there are new techniques becoming available which unlock new use cases.
|
| 45 |
+
|
| 46 |
+
Some emerging and standard use cases for text based long context include:
|
| 47 |
+
|
| 48 |
+
* Summarizing large corpuses of text
|
| 49 |
+
* Previous summarization options with smaller context models would require a sliding window or another technique to keep state of previous sections as new tokens are passed to the model
|
| 50 |
+
* Question and answering
|
| 51 |
+
* Historically this was only possible with RAG given the limited amount of context and models' factual recall being low
|
| 52 |
+
* Agentic workflows
|
| 53 |
+
* Text is the underpinning of how agents keep state of what they have done and what they need to do; not having enough information about the world and the agent's goal is a limitation on the reliability of agents
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
[Many-shot in-context learning](https://arxiv.org/pdf/2404.11018) is one of the most unique capabilities unlocked by long context models. Research has shown that taking the common "single shot" or "multi-shot" example paradigm, where the model is presented with one or a few examples of a task, and scaling that up to hundreds, thousands, or even hundreds of thousands of examples, can lead to novel model capabilities. This many-shot approach has also been shown to perform similarly to models which were fine-tuned for a specific task. For use cases where a Gemini model's performance is not yet sufficient for a production rollout, you can try the many-shot approach. As you might explore later in the long context optimization section, context caching makes this type of high input token workload much more economically feasible and even lower latency in some cases.
|
| 58 |
+
|
| 59 |
+
### Long form video
|
| 60 |
+
|
| 61 |
+
Video content's utility has long been constrained by the lack of accessibility of the medium itself. It was hard to skim the content, transcripts often failed to capture the nuance of a video, and most tools don't process image, text, and audio together. With Gemini, the long-context text capabilities translate to the ability to reason and answer questions about multimodal inputs with sustained performance.
|
| 62 |
+
|
| 63 |
+
Some emerging and standard use cases for video long context include:
|
| 64 |
+
|
| 65 |
+
* Video question and answering
|
| 66 |
+
* Video memory, as shown with [Google's Project Astra](https://deepmind.google/technologies/gemini/project-astra/)
|
| 67 |
+
* Video captioning
|
| 68 |
+
* Video recommendation systems, by enriching existing metadata with new multimodal understanding
|
| 69 |
+
* Video customization, by looking at a corpus of data and associated video metadata and then removing parts of videos that are not relevant to the viewer
|
| 70 |
+
* Video content moderation
|
| 71 |
+
* Real-time video processing
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
When working with videos, it is important to consider how the [videos are processed into tokens](/gemini-api/docs/tokens#media-token), which affects billing and usage limits. You can learn more about prompting with video files in the [Prompting guide](/gemini-api/docs/prompting_with_media?lang=python#prompting-with-videos).
|
| 76 |
+
|
| 77 |
+
### Long form audio
|
| 78 |
+
|
| 79 |
+
The Gemini models were the first natively multimodal large language models that could understand audio. Historically, the typical developer workflow involved stringing together multiple domain-specific models, such as a speech-to-text model and a text-to-text model, to process audio. This added latency from multiple round-trip requests and reduced performance, a result of the disconnected architectures of a multi-model setup.
|
| 80 |
+
|
| 81 |
+
Some emerging and standard use cases for audio context include:
|
| 82 |
+
|
| 83 |
+
* Real-time transcription and translation
|
| 84 |
+
* Podcast / video question and answering
|
| 85 |
+
* Meeting transcription and summarization
|
| 86 |
+
* Voice assistants
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
You can learn more about prompting with audio files in the [Prompting guide](/gemini-api/docs/prompting_with_media?lang=python#prompting-with-videos).
|
| 91 |
+
|
| 92 |
+
## Long context optimizations
|
| 93 |
+
|
| 94 |
+
The primary optimization when working with long context and the Gemini models is to use [context caching](/gemini-api/docs/caching). Beyond the previous impossibility of processing lots of tokens in a single request, the other main constraint was the cost. If you have a "chat with your data" app where a user uploads 10 PDFs, a video, and some work documents, you would historically have to work with a more complex retrieval augmented generation (RAG) tool / framework in order to process these requests and pay a significant amount for tokens moved into the context window. Now, you can cache the files the user uploads and pay to store them on a per hour basis. The input / output cost per request with Gemini Flash for example is ~4x less than the standard input / output cost, so if the user chats with their data enough, it becomes a huge cost saving for you as the developer.
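
As a rough illustration (not from the original page), caching a large document once and reusing it across requests could look like the sketch below; the file path and TTL are placeholders, and caching has minimum token requirements covered in the caching guide.

```python
from google import genai
from google.genai import types

client = genai.Client()

# Upload a large document once.
big_doc = client.files.upload(file="path/to/large_report.pdf")

# Cache it so later requests don't resend (and re-pay for) the same tokens.
cache = client.caches.create(
    model="gemini-2.5-flash",
    config=types.CreateCachedContentConfig(
        contents=[big_doc],
        system_instruction="You answer questions about the attached report.",
        ttl="3600s",  # keep the cache for one hour
    ),
)

# Subsequent requests reference the cache instead of the raw document.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Summarize the key findings.",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(response.text)
```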
|
| 95 |
+
|
| 96 |
+
## Long context limitations
|
| 97 |
+
|
| 98 |
+
In various sections of this guide, we talked about how Gemini models achieve high performance across various needle-in-a-haystack retrieval evals. These tests consider the most basic setup, where you have a single needle you are looking for. In cases where you might have multiple "needles" or specific pieces of information you are looking for, the model does not perform with the same accuracy. Performance can vary to a wide degree depending on the context. This is important to consider as there is an inherent tradeoff between getting the right information retrieved and cost. You can get ~99% on a single query, but you have to pay the input token cost every time you send that query. So for 100 pieces of information to be retrieved, if you needed 99% performance, you would likely need to send 100 requests. This is a good example of where context caching can significantly reduce the cost associated with using Gemini models while keeping the performance high.
|
| 99 |
+
|
| 100 |
+
## FAQs
|
| 101 |
+
|
| 102 |
+
### Where is the best place to put my query in the context window?
|
| 103 |
+
|
| 104 |
+
In most cases, especially if the total context is long, the model's performance will be better if you put your query / question at the end of the prompt (after all the other context).
|
| 105 |
+
|
| 106 |
+
### Do I lose model performance when I add more tokens to a query?
|
| 107 |
+
|
| 108 |
+
Generally, if you don't need tokens to be passed to the model, it is best to avoid passing them. However, if you have a large chunk of tokens with some information and want to ask questions about that information, the model is highly capable of extracting that information (up to 99% accuracy in many cases).
|
| 109 |
+
|
| 110 |
+
### How can I lower my cost with long-context queries?
|
| 111 |
+
|
| 112 |
+
If you have a similar set of tokens / context that you want to re-use many times, [context caching](/gemini-api/docs/caching) can help reduce the costs associated with asking questions about that information.
|
| 113 |
+
|
| 114 |
+
### Does the context length affect the model latency?
|
| 115 |
+
|
| 116 |
+
There is some fixed amount of latency in any given request, regardless of the size, but generally longer queries will have higher latency (time to first token).
|
documentation_gemini/speech_generation_text-to-speech.md
ADDED
|
@@ -0,0 +1,256 @@
|
| 1 |
+
# Speech generation (text-to-speech)
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/speech-generation>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
The Gemini API can transform text input into single speaker or multi-speaker audio using native text-to-speech (TTS) generation capabilities. Text-to-speech (TTS) generation is _controllable_ , meaning you can use natural language to structure interactions and guide the _style_ , _accent_ , _pace_ , and _tone_ of the audio.
|
| 8 |
+
|
| 9 |
+
The TTS capability differs from speech generation provided through the [Live API](/gemini-api/docs/live), which is designed for interactive, unstructured audio, and multimodal inputs and outputs. While the Live API excels in dynamic conversational contexts, TTS through the Gemini API is tailored for scenarios that require exact text recitation with fine-grained control over style and sound, such as podcast or audiobook generation.
|
| 10 |
+
|
| 11 |
+
This guide shows you how to generate single-speaker and multi-speaker audio from text.
|
| 12 |
+
|
| 13 |
+
**Preview:** Native text-to-speech (TTS) is in [Preview](/gemini-api/docs/models#preview).
|
| 14 |
+
|
| 15 |
+
## Before you begin
|
| 16 |
+
|
| 17 |
+
Ensure you use a Gemini 2.5 model variant with native text-to-speech (TTS) capabilities, as listed in the [Supported models](/gemini-api/docs/speech-generation#supported-models) section. For optimal results, consider which model best fits your specific use case.
|
| 18 |
+
|
| 19 |
+
You may find it useful to [test the Gemini 2.5 TTS models in AI Studio](https://aistudio.google.com/generate-speech) before you start building.
|
| 20 |
+
|
| 21 |
+
**Note:** TTS models accept text-only inputs and produce audio-only outputs. For a complete list of restrictions specific to TTS models, review the [Limitations](/gemini-api/docs/speech-generation#limitations) section.
|
| 22 |
+
|
| 23 |
+
## Single-speaker text-to-speech
|
| 24 |
+
|
| 25 |
+
To convert text to single-speaker audio, set the response modality to "audio", and pass a `SpeechConfig` object with `VoiceConfig` set. You'll need to choose a voice name from the prebuilt output voices.
|
| 26 |
+
|
| 27 |
+
This example saves the output audio from the model in a wave file:
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
from google import genai
|
| 31 |
+
from google.genai import types
|
| 32 |
+
import wave
|
| 33 |
+
|
| 34 |
+
# Set up the wave file to save the output:
|
| 35 |
+
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
|
| 36 |
+
with wave.open(filename, "wb") as wf:
|
| 37 |
+
wf.setnchannels(channels)
|
| 38 |
+
wf.setsampwidth(sample_width)
|
| 39 |
+
wf.setframerate(rate)
|
| 40 |
+
wf.writeframes(pcm)
|
| 41 |
+
|
| 42 |
+
client = genai.Client()
|
| 43 |
+
|
| 44 |
+
response = client.models.generate_content(
|
| 45 |
+
model="gemini-2.5-flash-preview-tts",
|
| 46 |
+
contents="Say cheerfully: Have a wonderful day!",
|
| 47 |
+
config=types.GenerateContentConfig(
|
| 48 |
+
response_modalities=["AUDIO"],
|
| 49 |
+
speech_config=types.SpeechConfig(
|
| 50 |
+
voice_config=types.VoiceConfig(
|
| 51 |
+
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
| 52 |
+
voice_name='Kore',
|
| 53 |
+
)
|
| 54 |
+
)
|
| 55 |
+
),
|
| 56 |
+
)
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
data = response.candidates[0].content.parts[0].inline_data.data
|
| 60 |
+
|
| 61 |
+
file_name='out.wav'
|
| 62 |
+
wave_file(file_name, data) # Saves the file to current directory
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
For more code samples, refer to the "TTS - Get Started" file in the cookbooks repository:
|
| 66 |
+
|
| 67 |
+
[View on GitHub](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Get_started_TTS.ipynb)
|
| 68 |
+
|
| 69 |
+
## Multi-speaker text-to-speech
|
| 70 |
+
|
| 71 |
+
For multi-speaker audio, you'll need a `MultiSpeakerVoiceConfig` object with each speaker (up to 2) configured as a `SpeakerVoiceConfig`. You'll need to define each `speaker` with the same names used in the prompt:
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
from google import genai
|
| 75 |
+
from google.genai import types
|
| 76 |
+
import wave
|
| 77 |
+
|
| 78 |
+
# Set up the wave file to save the output:
|
| 79 |
+
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
|
| 80 |
+
with wave.open(filename, "wb") as wf:
|
| 81 |
+
wf.setnchannels(channels)
|
| 82 |
+
wf.setsampwidth(sample_width)
|
| 83 |
+
wf.setframerate(rate)
|
| 84 |
+
wf.writeframes(pcm)
|
| 85 |
+
|
| 86 |
+
client = genai.Client()
|
| 87 |
+
|
| 88 |
+
prompt = """TTS the following conversation between Joe and Jane:
|
| 89 |
+
Joe: How's it going today Jane?
|
| 90 |
+
Jane: Not too bad, how about you?"""
|
| 91 |
+
|
| 92 |
+
response = client.models.generate_content(
|
| 93 |
+
model="gemini-2.5-flash-preview-tts",
|
| 94 |
+
contents=prompt,
|
| 95 |
+
config=types.GenerateContentConfig(
|
| 96 |
+
response_modalities=["AUDIO"],
|
| 97 |
+
speech_config=types.SpeechConfig(
|
| 98 |
+
multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
|
| 99 |
+
speaker_voice_configs=[
|
| 100 |
+
types.SpeakerVoiceConfig(
|
| 101 |
+
speaker='Joe',
|
| 102 |
+
voice_config=types.VoiceConfig(
|
| 103 |
+
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
| 104 |
+
voice_name='Kore',
|
| 105 |
+
)
|
| 106 |
+
)
|
| 107 |
+
),
|
| 108 |
+
types.SpeakerVoiceConfig(
|
| 109 |
+
speaker='Jane',
|
| 110 |
+
voice_config=types.VoiceConfig(
|
| 111 |
+
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
| 112 |
+
voice_name='Puck',
|
| 113 |
+
)
|
| 114 |
+
)
|
| 115 |
+
),
|
| 116 |
+
]
|
| 117 |
+
)
|
| 118 |
+
)
|
| 119 |
+
)
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
data = response.candidates[0].content.parts[0].inline_data.data
|
| 123 |
+
|
| 124 |
+
file_name='out.wav'
|
| 125 |
+
wave_file(file_name, data) # Saves the file to current directory
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
## Controlling speech style with prompts
|
| 129 |
+
|
| 130 |
+
You can control style, tone, accent, and pace using natural language prompts for both single- and multi-speaker TTS. For example, in a single-speaker prompt, you can say:
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
Say in a spooky whisper:
|
| 134 |
+
"By the pricking of my thumbs...
|
| 135 |
+
Something wicked this way comes"
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
In a multi-speaker prompt, provide the model with each speaker's name and corresponding transcript. You can also provide guidance for each speaker individually:
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
Make Speaker1 sound tired and bored, and Speaker2 sound excited and happy:
|
| 142 |
+
|
| 143 |
+
Speaker1: So... what's on the agenda today?
|
| 144 |
+
Speaker2: You're never going to guess!
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
Try using a voice option that corresponds to the style or emotion you want to convey, to emphasize it even more. In the previous prompt, for example, _Enceladus_ 's breathiness might emphasize "tired" and "bored", while _Puck_ 's upbeat tone could complement "excited" and "happy".
|
| 148 |
+
|
| 149 |
+
## Generating a prompt to convert to audio
|
| 150 |
+
|
| 151 |
+
The TTS models only output audio, but you can use [other models](/gemini-api/docs/models) to generate a transcript first, then pass that transcript to the TTS model to read aloud.
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
from google import genai
|
| 155 |
+
from google.genai import types
|
| 156 |
+
|
| 157 |
+
client = genai.Client()
|
| 158 |
+
|
| 159 |
+
transcript = client.models.generate_content(
|
| 160 |
+
model="gemini-2.0-flash",
|
| 161 |
+
contents="""Generate a short transcript around 100 words that reads
|
| 162 |
+
like it was clipped from a podcast by excited herpetologists.
|
| 163 |
+
The hosts names are Dr. Anya and Liam.""").text
|
| 164 |
+
|
| 165 |
+
response = client.models.generate_content(
|
| 166 |
+
model="gemini-2.5-flash-preview-tts",
|
| 167 |
+
contents=transcript,
|
| 168 |
+
config=types.GenerateContentConfig(
|
| 169 |
+
response_modalities=["AUDIO"],
|
| 170 |
+
speech_config=types.SpeechConfig(
|
| 171 |
+
multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
|
| 172 |
+
speaker_voice_configs=[
|
| 173 |
+
types.SpeakerVoiceConfig(
|
| 174 |
+
speaker='Dr. Anya',
|
| 175 |
+
voice_config=types.VoiceConfig(
|
| 176 |
+
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
| 177 |
+
voice_name='Kore',
|
| 178 |
+
)
|
| 179 |
+
)
|
| 180 |
+
),
|
| 181 |
+
types.SpeakerVoiceConfig(
|
| 182 |
+
speaker='Liam',
|
| 183 |
+
voice_config=types.VoiceConfig(
|
| 184 |
+
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
| 185 |
+
voice_name='Puck',
|
| 186 |
+
)
|
| 187 |
+
)
|
| 188 |
+
),
|
| 189 |
+
]
|
| 190 |
+
)
|
| 191 |
+
)
|
| 192 |
+
)
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
# ...Code to stream or save the output
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
## Voice options
|
| 199 |
+
|
| 200 |
+
TTS models support the following 30 voice options in the `voice_name` field:
|
| 201 |
+
|
| 202 |
+
**Zephyr** \-- _Bright_ | **Puck** \-- _Upbeat_ | **Charon** \-- _Informative_
|
| 203 |
+
---|---|---
|
| 204 |
+
**Kore** \-- _Firm_ | **Fenrir** \-- _Excitable_ | **Leda** \-- _Youthful_
|
| 205 |
+
**Orus** \-- _Firm_ | **Aoede** \-- _Breezy_ | **Callirrhoe** \-- _Easy-going_
|
| 206 |
+
**Autonoe** \-- _Bright_ | **Enceladus** \-- _Breathy_ | **Iapetus** \-- _Clear_
|
| 207 |
+
**Umbriel** \-- _Easy-going_ | **Algieba** \-- _Smooth_ | **Despina** \-- _Smooth_
|
| 208 |
+
**Erinome** \-- _Clear_ | **Algenib** \-- _Gravelly_ | **Rasalgethi** \-- _Informative_
|
| 209 |
+
**Laomedeia** \-- _Upbeat_ | **Achernar** \-- _Soft_ | **Alnilam** \-- _Firm_
|
| 210 |
+
**Schedar** \-- _Even_ | **Gacrux** \-- _Mature_ | **Pulcherrima** \-- _Forward_
|
| 211 |
+
**Achird** \-- _Friendly_ | **Zubenelgenubi** \-- _Casual_ | **Vindemiatrix** \-- _Gentle_
|
| 212 |
+
**Sadachbia** \-- _Lively_ | **Sadaltager** \-- _Knowledgeable_ | **Sulafat** \-- _Warm_
|
| 213 |
+
|
| 214 |
+
You can hear all the voice options in [AI Studio](https://aistudio.google.com/generate-speech).
|
| 215 |
+
|
| 216 |
+
## Supported languages
|
| 217 |
+
|
| 218 |
+
The TTS models detect the input language automatically. They support the following 24 languages:
|
| 219 |
+
|
| 220 |
+
Language | BCP-47 Code | Language | BCP-47 Code
|
| 221 |
+
---|---|---|---
|
| 222 |
+
Arabic (Egyptian) | `ar-EG` | German (Germany) | `de-DE`
|
| 223 |
+
English (US) | `en-US` | Spanish (US) | `es-US`
|
| 224 |
+
French (France) | `fr-FR` | Hindi (India) | `hi-IN`
|
| 225 |
+
Indonesian (Indonesia) | `id-ID` | Italian (Italy) | `it-IT`
|
| 226 |
+
Japanese (Japan) | `ja-JP` | Korean (Korea) | `ko-KR`
|
| 227 |
+
Portuguese (Brazil) | `pt-BR` | Russian (Russia) | `ru-RU`
|
| 228 |
+
Dutch (Netherlands) | `nl-NL` | Polish (Poland) | `pl-PL`
|
| 229 |
+
Thai (Thailand) | `th-TH` | Turkish (Turkey) | `tr-TR`
|
| 230 |
+
Vietnamese (Vietnam) | `vi-VN` | Romanian (Romania) | `ro-RO`
|
| 231 |
+
Ukrainian (Ukraine) | `uk-UA` | Bengali (Bangladesh) | `bn-BD`
|
| 232 |
+
English (India) | `en-IN` & `hi-IN` bundle | Marathi (India) | `mr-IN`
|
| 233 |
+
Tamil (India) | `ta-IN` | Telugu (India) | `te-IN`
|
| 234 |
+
|
| 235 |
+
## Supported models
|
| 236 |
+
|
| 237 |
+
Model | Single speaker | Multispeaker
|
| 238 |
+
---|---|---
|
| 239 |
+
[Gemini 2.5 Flash Preview TTS](/gemini-api/docs/models#gemini-2.5-flash-preview-tts) | ✔️ | ✔️
|
| 240 |
+
[Gemini 2.5 Pro Preview TTS](/gemini-api/docs/models#gemini-2.5-pro-preview-tts) | ✔️ | ✔️
|
| 241 |
+
|
| 242 |
+
## Limitations
|
| 243 |
+
|
| 244 |
+
* TTS models can only receive text inputs and generate audio outputs.
|
| 245 |
+
* A TTS session has a [context window](/gemini-api/docs/long-context) limit of 32k tokens.
|
| 246 |
+
* Review [Languages](/gemini-api/docs/speech-generation#languages) section for language support.
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
## What's next
|
| 251 |
+
|
| 252 |
+
* Try the [audio generation cookbook](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Get_started_TTS.ipynb).
|
| 253 |
+
* Gemini's [Live API](/gemini-api/docs/live) offers interactive audio generation options you can interleave with other modalities.
|
| 254 |
+
* For working with audio _inputs_ , visit the [Audio understanding](/gemini-api/docs/audio) guide.
|
| 255 |
+
|
| 256 |
+
|
documentation_gemini/structured_output.md
ADDED
|
@@ -0,0 +1,392 @@
|
| 1 |
+
# Structured output
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/structured-output>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
You can configure Gemini for structured output instead of unstructured text, allowing precise extraction and standardization of information for further processing. For example, you can use structured output to extract information from resumes and standardize it to build a structured database.
|
| 8 |
+
|
| 9 |
+
Gemini can generate either [JSON](/gemini-api/docs/structured-output#generating-json) or [enum values](/gemini-api/docs/structured-output#generating-enums) as structured output.
|
| 10 |
+
|
| 11 |
+
## Generating JSON
|
| 12 |
+
|
| 13 |
+
To constrain the model to generate JSON, configure a `responseSchema`. The model will then respond to any prompt with JSON-formatted output.
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
from google import genai
|
| 17 |
+
from pydantic import BaseModel
|
| 18 |
+
|
| 19 |
+
class Recipe(BaseModel):
|
| 20 |
+
recipe_name: str
|
| 21 |
+
ingredients: list[str]
|
| 22 |
+
|
| 23 |
+
client = genai.Client()
|
| 24 |
+
response = client.models.generate_content(
|
| 25 |
+
model="gemini-2.5-flash",
|
| 26 |
+
contents="List a few popular cookie recipes, and include the amounts of ingredients.",
|
| 27 |
+
config={
|
| 28 |
+
"response_mime_type": "application/json",
|
| 29 |
+
"response_schema": list[Recipe],
|
| 30 |
+
},
|
| 31 |
+
)
|
| 32 |
+
# Use the response as a JSON string.
|
| 33 |
+
print(response.text)
|
| 34 |
+
|
| 35 |
+
# Use instantiated objects.
|
| 36 |
+
my_recipes: list[Recipe] = response.parsed
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
**Note:** [Pydantic validators](https://docs.pydantic.dev/latest/concepts/validators/) are not yet supported. If a `pydantic.ValidationError` occurs, it is suppressed, and `.parsed` may be empty/null.
|
| 40 |
+
|
| 41 |
+
The output might look like this:
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
[
|
| 45 |
+
{
|
| 46 |
+
"recipeName": "Chocolate Chip Cookies",
|
| 47 |
+
"ingredients": [
|
| 48 |
+
"1 cup (2 sticks) unsalted butter, softened",
|
| 49 |
+
"3/4 cup granulated sugar",
|
| 50 |
+
"3/4 cup packed brown sugar",
|
| 51 |
+
"1 teaspoon vanilla extract",
|
| 52 |
+
"2 large eggs",
|
| 53 |
+
"2 1/4 cups all-purpose flour",
|
| 54 |
+
"1 teaspoon baking soda",
|
| 55 |
+
"1 teaspoon salt",
|
| 56 |
+
"2 cups chocolate chips"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
...
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
## Generating enum values
|
| 64 |
+
|
| 65 |
+
In some cases you might want the model to choose a single option from a list of options. To implement this behavior, you can pass an _enum_ in your schema. You can use an enum option anywhere you could use a `string` in the `responseSchema`, because an enum is an array of strings. Like a JSON schema, an enum lets you constrain model output to meet the requirements of your application.
|
| 66 |
+
|
| 67 |
+
For example, assume that you're developing an application to classify musical instruments into one of five categories: `"Percussion"`, `"String"`, `"Woodwind"`, `"Brass"`, or `"Keyboard"`. You could create an enum to help with this task.
|
| 68 |
+
|
| 69 |
+
In the following example, you pass an enum as the `responseSchema`, constraining the model to choose the most appropriate option.
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
from google import genai
|
| 73 |
+
import enum
|
| 74 |
+
|
| 75 |
+
class Instrument(enum.Enum):
|
| 76 |
+
PERCUSSION = "Percussion"
|
| 77 |
+
STRING = "String"
|
| 78 |
+
WOODWIND = "Woodwind"
|
| 79 |
+
BRASS = "Brass"
|
| 80 |
+
KEYBOARD = "Keyboard"
|
| 81 |
+
|
| 82 |
+
client = genai.Client()
|
| 83 |
+
response = client.models.generate_content(
|
| 84 |
+
model='gemini-2.5-flash',
|
| 85 |
+
contents='What type of instrument is an oboe?',
|
| 86 |
+
config={
|
| 87 |
+
'response_mime_type': 'text/x.enum',
|
| 88 |
+
'response_schema': Instrument,
|
| 89 |
+
},
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
print(response.text)
|
| 93 |
+
# Woodwind
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
The Python library will translate the type declarations for the API. However, the API accepts a subset of the OpenAPI 3.0 schema ([Schema](https://ai.google.dev/api/caching#schema)).
|
| 97 |
+
|
| 98 |
+
There are two other ways to specify an enumeration. You can use a [`Literal`](https://docs.pydantic.dev/1.10/usage/types/#literal-type):
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
Literal["Percussion", "String", "Woodwind", "Brass", "Keyboard"]
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
And you can also pass the schema as JSON:
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
from google import genai
|
| 108 |
+
|
| 109 |
+
client = genai.Client()
|
| 110 |
+
response = client.models.generate_content(
|
| 111 |
+
model='gemini-2.5-flash',
|
| 112 |
+
contents='What type of instrument is an oboe?',
|
| 113 |
+
config={
|
| 114 |
+
'response_mime_type': 'text/x.enum',
|
| 115 |
+
'response_schema': {
|
| 116 |
+
"type": "STRING",
|
| 117 |
+
"enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"],
|
| 118 |
+
},
|
| 119 |
+
},
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
print(response.text)
|
| 123 |
+
# Woodwind
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
Beyond basic multiple choice problems, you can use an enum anywhere in a JSON schema. For example, you could ask the model for a list of recipe titles and use a `Grade` enum to give each title a popularity grade:
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
from google import genai
|
| 130 |
+
|
| 131 |
+
import enum
|
| 132 |
+
from pydantic import BaseModel
|
| 133 |
+
|
| 134 |
+
class Grade(enum.Enum):
|
| 135 |
+
A_PLUS = "a+"
|
| 136 |
+
A = "a"
|
| 137 |
+
B = "b"
|
| 138 |
+
C = "c"
|
| 139 |
+
D = "d"
|
| 140 |
+
F = "f"
|
| 141 |
+
|
| 142 |
+
class Recipe(BaseModel):
|
| 143 |
+
recipe_name: str
|
| 144 |
+
rating: Grade
|
| 145 |
+
|
| 146 |
+
client = genai.Client()
|
| 147 |
+
response = client.models.generate_content(
|
| 148 |
+
model='gemini-2.5-flash',
|
| 149 |
+
contents='List 10 home-baked cookie recipes and give them grades based on tastiness.',
|
| 150 |
+
config={
|
| 151 |
+
'response_mime_type': 'application/json',
|
| 152 |
+
'response_schema': list[Recipe],
|
| 153 |
+
},
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
print(response.text)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
The response might look like this:
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
[
|
| 163 |
+
{
|
| 164 |
+
"recipe_name": "Chocolate Chip Cookies",
|
| 165 |
+
"rating": "a+"
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"recipe_name": "Peanut Butter Cookies",
|
| 169 |
+
"rating": "a"
|
| 170 |
+
},
|
| 171 |
+
{
|
| 172 |
+
"recipe_name": "Oatmeal Raisin Cookies",
|
| 173 |
+
"rating": "b"
|
| 174 |
+
},
|
| 175 |
+
...
|
| 176 |
+
]
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
## About JSON schemas
|
| 180 |
+
|
| 181 |
+
Configuring the model for JSON output using the `responseSchema` parameter relies on the `Schema` object to define its structure. This object represents a select subset of the [OpenAPI 3.0 Schema object](https://spec.openapis.org/oas/v3.0.3#schema-object), and also adds a `propertyOrdering` field.
|
| 182 |
+
|
| 183 |
+
**Tip:** On Python, when you use a Pydantic model, you don't need to directly work with `Schema` objects, as it gets automatically converted to the corresponding JSON schema. To learn more, see JSON schemas in Python.
|
| 184 |
+
|
| 185 |
+
Here's a pseudo-JSON representation of all the `Schema` fields:
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
{
|
| 189 |
+
"type": enum (Type),
|
| 190 |
+
"format": string,
|
| 191 |
+
"description": string,
|
| 192 |
+
"nullable": boolean,
|
| 193 |
+
"enum": [
|
| 194 |
+
string
|
| 195 |
+
],
|
| 196 |
+
"maxItems": integer,
|
| 197 |
+
"minItems": integer,
|
| 198 |
+
"properties": {
|
| 199 |
+
string: {
|
| 200 |
+
object (Schema)
|
| 201 |
+
},
|
| 202 |
+
...
|
| 203 |
+
},
|
| 204 |
+
"required": [
|
| 205 |
+
string
|
| 206 |
+
],
|
| 207 |
+
"propertyOrdering": [
|
| 208 |
+
string
|
| 209 |
+
],
|
| 210 |
+
"items": {
|
| 211 |
+
object (Schema)
|
| 212 |
+
}
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
The `Type` of the schema must be one of the OpenAPI [Data Types](https://spec.openapis.org/oas/v3.0.3#data-types), or a union of those types (using `anyOf`). Only a subset of fields is valid for each `Type`. The following list maps each `Type` to a subset of the fields that are valid for that type:
|
| 217 |
+
|
| 218 |
+
* `string` -> `enum`, `format`, `nullable`
|
| 219 |
+
* `integer` -> `format`, `minimum`, `maximum`, `enum`, `nullable`
|
| 220 |
+
* `number` -> `format`, `minimum`, `maximum`, `enum`, `nullable`
|
| 221 |
+
* `boolean` -> `nullable`
|
| 222 |
+
* `array` -> `minItems`, `maxItems`, `items`, `nullable`
|
| 223 |
+
* `object` -> `properties`, `required`, `propertyOrdering`, `nullable`
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
Here are some example schemas showing valid type-and-field combinations:
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
{ "type": "string", "enum": ["a", "b", "c"] }
|
| 231 |
+
|
| 232 |
+
{ "type": "string", "format": "date-time" }
|
| 233 |
+
|
| 234 |
+
{ "type": "integer", "format": "int64" }
|
| 235 |
+
|
| 236 |
+
{ "type": "number", "format": "double" }
|
| 237 |
+
|
| 238 |
+
{ "type": "boolean" }
|
| 239 |
+
|
| 240 |
+
{ "type": "array", "minItems": 3, "maxItems": 3, "items": { "type": ... } }
|
| 241 |
+
|
| 242 |
+
{ "type": "object",
|
| 243 |
+
"properties": {
|
| 244 |
+
"a": { "type": ... },
|
| 245 |
+
"b": { "type": ... },
|
| 246 |
+
"c": { "type": ... }
|
| 247 |
+
},
|
| 248 |
+
"nullable": true,
|
| 249 |
+
"required": ["c"],
|
| 250 |
+
"propertyOrdering": ["c", "b", "a"]
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
For complete documentation of the Schema fields as they're used in the Gemini API, see the [Schema reference](/api/caching#Schema).
|
| 255 |
+
|
| 256 |
+
### Property ordering
|
| 257 |
+
|
| 258 |
+
**Warning:** When you're configuring a JSON schema, make sure to set `propertyOrdering[]`, and when you provide examples, make sure that the property ordering in the examples matches the schema.
|
| 259 |
+
|
| 260 |
+
When you're working with JSON schemas in the Gemini API, the order of properties is important. By default, the API orders properties alphabetically and does not preserve the order in which the properties are defined (although the [Google Gen AI SDKs](/gemini-api/docs/sdks) may preserve this order). If you're providing examples to the model with a schema configured, and the property ordering of the examples is not consistent with the property ordering of the schema, the output could be rambling or unexpected.
|
| 261 |
+
|
| 262 |
+
To ensure a consistent, predictable ordering of properties, you can use the optional `propertyOrdering[]` field.
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
"propertyOrdering": ["recipeName", "ingredients"]
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
`propertyOrdering[]` – not a standard field in the OpenAPI specification – is an array of strings used to determine the order of properties in the response. By specifying the order of properties and then providing examples with properties in that same order, you can potentially improve the quality of results. `propertyOrdering` is only supported when you manually create `types.Schema`.
|
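For example, here is a minimal sketch (the prompt and property names are illustrative, not from the reference) of manually building a `types.Schema` so that `property_ordering` can be set explicitly:

from google import genai
from google.genai import types

client = genai.Client()

# Hypothetical schema: a recipe object whose properties should be emitted
# in the order recipe_name, then ingredients.
recipe_schema = types.Schema(
    type=types.Type.OBJECT,
    properties={
        "recipe_name": types.Schema(type=types.Type.STRING),
        "ingredients": types.Schema(
            type=types.Type.ARRAY,
            items=types.Schema(type=types.Type.STRING),
        ),
    },
    required=["recipe_name"],
    property_ordering=["recipe_name", "ingredients"],
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Give me a classic chocolate chip cookie recipe.",
    config={
        "response_mime_type": "application/json",
        "response_schema": recipe_schema,
    },
)
print(response.text)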
| 269 |
+
|
| 270 |
+
### Schemas in Python
|
| 271 |
+
|
| 272 |
+
When you're using the Python library, the value of `response_schema` must be one of the following:
|
| 273 |
+
|
| 274 |
+
* A type, as you would use in a type annotation (see the Python [`typing` module](https://docs.python.org/3/library/typing.html))
|
| 275 |
+
* An instance of [`genai.types.Schema`](https://googleapis.github.io/python-genai/genai.html#genai.types.Schema)
|
| 276 |
+
* The `dict` equivalent of `genai.types.Schema`
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
The easiest way to define a schema is with a Pydantic type (as shown in the previous example):
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
config={'response_mime_type': 'application/json',
|
| 284 |
+
'response_schema': list[Recipe]}
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
When you use a Pydantic type, the Python library builds out a JSON schema for you and sends it to the API. For additional examples, see the [Python library docs](https://googleapis.github.io/python-genai/index.html#json-response-schema).
|
| 288 |
+
|
| 289 |
+
The Python library supports schemas defined with the following types (where `AllowedType` is any allowed type):
|
| 290 |
+
|
| 291 |
+
* `int`
|
| 292 |
+
* `float`
|
| 293 |
+
* `bool`
|
| 294 |
+
* `str`
|
| 295 |
+
* `list[AllowedType]`
|
| 296 |
+
* `AllowedType|AllowedType|...`
|
| 297 |
+
* For structured types:
|
| 298 |
+
* `dict[str, AllowedType]`. This annotation declares all dict values to be the same type, but doesn't specify what keys should be included (see the sketch after this list).
|
| 299 |
+
* User-defined [Pydantic models](https://docs.pydantic.dev/latest/concepts/models/). This approach lets you specify the key names and define different types for the values associated with each of the keys, including nested structures.
|
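A minimal sketch of the `dict[str, AllowedType]` annotation mentioned in the list above (the prompt and expected keys are illustrative):

from google import genai

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Estimate the calories in a banana, an apple, and an orange.",
    config={
        "response_mime_type": "application/json",
        # Every value in the returned JSON object must be an int;
        # the choice of keys is left to the model.
        "response_schema": dict[str, int],
    },
)
print(response.text)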
| 300 |
+
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
### JSON Schema support
|
| 304 |
+
|
| 305 |
+
[JSON Schema](https://json-schema.org/) is a more recent specification than OpenAPI 3.0, which the [Schema](/api/caching#Schema) object is based on. Support for JSON Schema is available as a preview using the field [`responseJsonSchema`](/api/generate-content#FIELDS.response_json_schema) which accepts any JSON Schema with the following limitations:
|
| 306 |
+
|
| 307 |
+
* It only works with Gemini 2.5.
|
| 308 |
+
* While all JSON Schema properties can be passed, not all are supported. See the [documentation](/api/generate-content#FIELDS.response_json_schema) for the field for more details.
|
| 309 |
+
* Recursive references can only be used as the value of a non-required object property.
|
| 310 |
+
* Recursive references are unrolled to a finite degree, based on the size of the schema.
|
| 311 |
+
* Schemas that contain `$ref` cannot contain any properties other than those starting with a `$`.
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
Here's an example of generating a JSON Schema with Pydantic and submitting it to the model:
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
curl "https://generativelanguage.googleapis.com/v1alpha/models/\
|
| 319 |
+
gemini-2.5-flash:generateContent" \
|
| 320 |
+
-H "x-goog-api-key: $GEMINI_API_KEY"\
|
| 321 |
+
-H 'Content-Type: application/json' \
|
| 322 |
+
-d @- <<EOF
|
| 323 |
+
{
|
| 324 |
+
"contents": [{
|
| 325 |
+
"parts":[{
|
| 326 |
+
"text": "Please give a random example following this schema"
|
| 327 |
+
}]
|
| 328 |
+
}],
|
| 329 |
+
"generationConfig": {
|
| 330 |
+
"response_mime_type": "application/json",
|
| 331 |
+
"response_json_schema": $(python3 - << PYEOF
|
| 332 |
+
from enum import Enum
|
| 333 |
+
from typing import List, Optional, Union, Set
|
| 334 |
+
from pydantic import BaseModel, Field, ConfigDict
|
| 335 |
+
import json
|
| 336 |
+
|
| 337 |
+
class UserRole(str, Enum):
|
| 338 |
+
ADMIN = "admin"
|
| 339 |
+
VIEWER = "viewer"
|
| 340 |
+
|
| 341 |
+
class Address(BaseModel):
|
| 342 |
+
street: str
|
| 343 |
+
city: str
|
| 344 |
+
|
| 345 |
+
class UserProfile(BaseModel):
|
| 346 |
+
username: str = Field(description="User's unique name")
|
| 347 |
+
age: Optional[int] = Field(ge=0, le=120)
|
| 348 |
+
roles: Set[UserRole] = Field(min_items=1)
|
| 349 |
+
contact: Union[Address, str]
|
| 350 |
+
model_config = ConfigDict(title="User Schema")
|
| 351 |
+
|
| 352 |
+
# Generate and print the JSON Schema
|
| 353 |
+
print(json.dumps(UserProfile.model_json_schema(), indent=2))
|
| 354 |
+
PYEOF
|
| 355 |
+
)
|
| 356 |
+
}
|
| 357 |
+
}
|
| 358 |
+
EOF
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
Passing JSON Schema directly is not yet supported when using the SDK.
|
| 362 |
+
|
| 363 |
+
## Best practices
|
| 364 |
+
|
| 365 |
+
Keep the following considerations and best practices in mind when you're using a response schema:
|
| 366 |
+
|
| 367 |
+
* The size of your response schema counts towards the input token limit.
|
| 368 |
+
* By default, fields are optional, meaning the model can populate the fields or skip them. You can set fields as required to force the model to provide a value (see the sketch after this list). If there's insufficient context in the associated input prompt, the model generates responses mainly based on the data it was trained on.
|
| 369 |
+
* A complex schema can result in an `InvalidArgument: 400` error. Complexity might come from long property names, long array length limits, enums with many values, objects with lots of optional properties, or a combination of these factors.
|
| 370 |
+
|
| 371 |
+
If you get this error with a valid schema, make one or more of the following changes to resolve the error:
|
| 372 |
+
|
| 373 |
+
* Shorten property names or enum names.
|
| 374 |
+
* Flatten nested arrays.
|
| 375 |
+
* Reduce the number of properties with constraints, such as numbers with minimum and maximum limits.
|
| 376 |
+
* Reduce the number of properties with complex constraints, such as properties with complex formats like `date-time`.
|
| 377 |
+
* Reduce the number of optional properties.
|
| 378 |
+
* Reduce the number of valid values for enums.
|
| 379 |
+
* If you aren't seeing the results you expect, add more context to your input prompts or revise your response schema. For example, review the model's response without structured output to see how the model responds. You can then update your response schema so that it better fits the model's output. For additional troubleshooting tips on structured output, see the [troubleshooting guide](/gemini-api/docs/troubleshooting#repetitive-tokens).
|
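As a minimal sketch of the optional-versus-required behavior noted in the list above (the model and field names are illustrative): a Pydantic field without a default is listed as required in the generated schema, while a field with a default may be omitted by the model.

from typing import Optional

from pydantic import BaseModel
from google import genai

class Product(BaseModel):
    name: str                          # no default, so listed as required in the schema
    price_usd: Optional[float] = None  # has a default, so the model may skip it

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Describe one fictional gadget as a product entry.",
    config={
        "response_mime_type": "application/json",
        "response_schema": Product,
    },
)
print(response.text)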
| 380 |
+
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
## What's next
|
| 385 |
+
|
| 386 |
+
Now that you've learned how to generate structured output, you might want to try using Gemini API tools:
|
| 387 |
+
|
| 388 |
+
* [Function calling](/gemini-api/docs/function-calling)
|
| 389 |
+
* [Code execution](/gemini-api/docs/code-execution)
|
| 390 |
+
* [Grounding with Google Search](/gemini-api/docs/grounding)
|
| 391 |
+
|
| 392 |
+
|
documentation_gemini/text_generation.md
ADDED
|
@@ -0,0 +1,199 @@
|
| 1 |
+
# Text generation
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/text-generation>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models.
|
| 8 |
+
|
| 9 |
+
Here's a basic example that takes a single text input:
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
from google import genai
|
| 13 |
+
|
| 14 |
+
client = genai.Client()
|
| 15 |
+
|
| 16 |
+
response = client.models.generate_content(
|
| 17 |
+
model="gemini-2.5-flash",
|
| 18 |
+
contents="How does AI work?"
|
| 19 |
+
)
|
| 20 |
+
print(response.text)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
## Thinking with Gemini 2.5
|
| 24 |
+
|
| 25 |
+
2.5 Flash and Pro models have ["thinking"](/gemini-api/docs/thinking) enabled by default to enhance quality, which may take longer to run and increase token usage.
|
| 26 |
+
|
| 27 |
+
When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero.
|
| 28 |
+
|
| 29 |
+
For more details, see the [thinking guide](/gemini-api/docs/thinking#set-budget).
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
from google import genai
|
| 33 |
+
from google.genai import types
|
| 34 |
+
|
| 35 |
+
client = genai.Client()
|
| 36 |
+
|
| 37 |
+
response = client.models.generate_content(
|
| 38 |
+
model="gemini-2.5-flash",
|
| 39 |
+
contents="How does AI work?",
|
| 40 |
+
config=types.GenerateContentConfig(
|
| 41 |
+
thinking_config=types.ThinkingConfig(thinking_budget=0) # Disables thinking
|
| 42 |
+
),
|
| 43 |
+
)
|
| 44 |
+
print(response.text)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
## System instructions and other configurations
|
| 48 |
+
|
| 49 |
+
You can guide the behavior of Gemini models with system instructions. To do so, pass a [`GenerateContentConfig`](/api/generate-content#v1beta.GenerationConfig) object.
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
from google import genai
|
| 53 |
+
from google.genai import types
|
| 54 |
+
|
| 55 |
+
client = genai.Client()
|
| 56 |
+
|
| 57 |
+
response = client.models.generate_content(
|
| 58 |
+
model="gemini-2.5-flash",
|
| 59 |
+
config=types.GenerateContentConfig(
|
| 60 |
+
system_instruction="You are a cat. Your name is Neko."),
|
| 61 |
+
contents="Hello there"
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
print(response.text)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
The [`GenerateContentConfig`](/api/generate-content#v1beta.GenerationConfig) object also lets you override default generation parameters, such as [temperature](/api/generate-content#v1beta.GenerationConfig).
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
from google import genai
|
| 71 |
+
from google.genai import types
|
| 72 |
+
|
| 73 |
+
client = genai.Client()
|
| 74 |
+
|
| 75 |
+
response = client.models.generate_content(
|
| 76 |
+
model="gemini-2.5-flash",
|
| 77 |
+
contents=["Explain how AI works"],
|
| 78 |
+
config=types.GenerateContentConfig(
|
| 79 |
+
temperature=0.1
|
| 80 |
+
)
|
| 81 |
+
)
|
| 82 |
+
print(response.text)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
Refer to the [`GenerateContentConfig`](/api/generate-content#v1beta.GenerationConfig) in our API reference for a complete list of configurable parameters and their descriptions.
|
| 86 |
+
|
| 87 |
+
## Multimodal inputs
|
| 88 |
+
|
| 89 |
+
The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image:
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
from PIL import Image
|
| 93 |
+
from google import genai
|
| 94 |
+
|
| 95 |
+
client = genai.Client()
|
| 96 |
+
|
| 97 |
+
image = Image.open("/path/to/organ.png")
|
| 98 |
+
response = client.models.generate_content(
|
| 99 |
+
model="gemini-2.5-flash",
|
| 100 |
+
contents=[image, "Tell me about this instrument"]
|
| 101 |
+
)
|
| 102 |
+
print(response.text)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
For alternative methods of providing images and more advanced image processing, see our [image understanding guide](/gemini-api/docs/image-understanding). The API also supports [document](/gemini-api/docs/document-processing), [video](/gemini-api/docs/video-understanding), and [audio](/gemini-api/docs/audio) inputs and understanding.
|
| 106 |
+
|
| 107 |
+
## Streaming responses
|
| 108 |
+
|
| 109 |
+
By default, the model returns a response only after the entire generation process is complete.
|
| 110 |
+
|
| 111 |
+
For more fluid interactions, use streaming to receive [`GenerateContentResponse`](/api/generate-content#v1beta.GenerateContentResponse) instances incrementally as they're generated.
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
from google import genai
|
| 115 |
+
|
| 116 |
+
client = genai.Client()
|
| 117 |
+
|
| 118 |
+
response = client.models.generate_content_stream(
|
| 119 |
+
model="gemini-2.5-flash",
|
| 120 |
+
contents=["Explain how AI works"]
|
| 121 |
+
)
|
| 122 |
+
for chunk in response:
|
| 123 |
+
print(chunk.text, end="")
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
## Multi-turn conversations (Chat)
|
| 127 |
+
|
| 128 |
+
Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history.
|
| 129 |
+
|
| 130 |
+
**Note:** Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the [`generateContent`](/api/generate-content#method:-models.generatecontent) API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn.
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
from google import genai
|
| 134 |
+
|
| 135 |
+
client = genai.Client()
|
| 136 |
+
chat = client.chats.create(model="gemini-2.5-flash")
|
| 137 |
+
|
| 138 |
+
response = chat.send_message("I have 2 dogs in my house.")
|
| 139 |
+
print(response.text)
|
| 140 |
+
|
| 141 |
+
response = chat.send_message("How many paws are in my house?")
|
| 142 |
+
print(response.text)
|
| 143 |
+
|
| 144 |
+
for message in chat.get_history():
|
| 145 |
+
print(f'role - {message.role}',end=": ")
|
| 146 |
+
print(message.parts[0].text)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
Streaming can also be used for multi-turn conversations.
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
from google import genai
|
| 153 |
+
|
| 154 |
+
client = genai.Client()
|
| 155 |
+
chat = client.chats.create(model="gemini-2.5-flash")
|
| 156 |
+
|
| 157 |
+
response = chat.send_message_stream("I have 2 dogs in my house.")
|
| 158 |
+
for chunk in response:
|
| 159 |
+
print(chunk.text, end="")
|
| 160 |
+
|
| 161 |
+
response = chat.send_message_stream("How many paws are in my house?")
|
| 162 |
+
for chunk in response:
|
| 163 |
+
print(chunk.text, end="")
|
| 164 |
+
|
| 165 |
+
for message in chat.get_history():
|
| 166 |
+
print(f'role - {message.role}', end=": ")
|
| 167 |
+
print(message.parts[0].text)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
## Supported models
|
| 171 |
+
|
| 172 |
+
All models in the Gemini family support text generation. To learn more about the models and their capabilities, visit the [Models](/gemini-api/docs/models) page.
|
| 173 |
+
|
| 174 |
+
## Best practices
|
| 175 |
+
|
| 176 |
+
### Prompting tips
|
| 177 |
+
|
| 178 |
+
For basic text generation, a [zero-shot](/gemini-api/docs/prompting-strategies#few-shot) prompt often suffices without needing examples, system instructions or specific formatting.
|
| 179 |
+
|
| 180 |
+
For more tailored outputs:
|
| 181 |
+
|
| 182 |
+
* Use System instructions to guide the model.
|
| 183 |
+
* Provide a few example inputs and outputs to guide the model. This is often referred to as [few-shot](/gemini-api/docs/prompting-strategies#few-shot) prompting (see the sketch after this list).
|
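A minimal few-shot sketch (the classification task and review texts are illustrative): the prompt itself carries a couple of input/output examples before the real query.

from google import genai

client = genai.Client()

few_shot_prompt = """Classify the sentiment of each review as POSITIVE or NEGATIVE.

Review: "The battery lasts all day, I love it."
Sentiment: POSITIVE

Review: "It stopped working after two days."
Sentiment: NEGATIVE

Review: "Setup was painless and the screen is gorgeous."
Sentiment:"""

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=few_shot_prompt,
)
print(response.text)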
| 184 |
+
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
Consult our [prompt engineering guide](/gemini-api/docs/prompting-strategies) for more tips.
|
| 188 |
+
|
| 189 |
+
### Structured output
|
| 190 |
+
|
| 191 |
+
In some cases, you may need structured output, such as JSON. Refer to our [structured output](/gemini-api/docs/structured-output) guide to learn how.
|
| 192 |
+
|
| 193 |
+
## What's next
|
| 194 |
+
|
| 195 |
+
* Try the [Gemini API getting started Colab](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Get_started.ipynb).
|
| 196 |
+
* Explore Gemini's [image](/gemini-api/docs/image-understanding), [video](/gemini-api/docs/video-understanding), [audio](/gemini-api/docs/audio) and [document](/gemini-api/docs/document-processing) understanding capabilities.
|
| 197 |
+
* Learn about multimodal [file prompting strategies](/gemini-api/docs/files#prompt-guide).
|
| 198 |
+
|
| 199 |
+
|
documentation_gemini/url_context.md
ADDED
|
@@ -0,0 +1,198 @@
|
| 1 |
+
# URL context
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/url-context>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
The URL context tool lets you provide additional context to the models in the form of URLs. By including URLs in your request, the model will access the content from those pages (as long as they aren't URL types listed in the limitations section) to inform and enhance its response.
|
| 8 |
+
|
| 9 |
+
The URL context tool is useful for tasks like the following:
|
| 10 |
+
|
| 11 |
+
* **Extract Data** : Pull specific info like prices, names, or key findings from multiple URLs.
|
| 12 |
+
* **Compare Documents** : Analyze multiple reports, articles, or PDFs to identify differences and track trends.
|
| 13 |
+
* **Synthesize & Create Content**: Combine information from several source URLs to generate accurate summaries, blog posts, or reports.
|
| 14 |
+
* **Analyze Code & Docs**: Point to a GitHub repository or technical documentation to explain code, generate setup instructions, or answer questions.
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
The following example shows how to compare two recipes from different websites.
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
from google import genai
|
| 22 |
+
from google.genai.types import Tool, GenerateContentConfig
|
| 23 |
+
|
| 24 |
+
client = genai.Client()
|
| 25 |
+
model_id = "gemini-2.5-flash"
|
| 26 |
+
|
| 27 |
+
tools = [
|
| 28 |
+
{"url_context": {}},
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
url1 = "https://www.foodnetwork.com/recipes/ina-garten/perfect-roast-chicken-recipe-1940592"
|
| 32 |
+
url2 = "https://www.allrecipes.com/recipe/21151/simple-whole-roast-chicken/"
|
| 33 |
+
|
| 34 |
+
response = client.models.generate_content(
|
| 35 |
+
model=model_id,
|
| 36 |
+
contents=f"Compare the ingredients and cooking times from the recipes at {url1} and {url2}",
|
| 37 |
+
config=GenerateContentConfig(
|
| 38 |
+
tools=tools,
|
| 39 |
+
)
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
for each in response.candidates[0].content.parts:
|
| 43 |
+
print(each.text)
|
| 44 |
+
|
| 45 |
+
# For verification, you can inspect the metadata to see which URLs the model retrieved
|
| 46 |
+
print(response.candidates[0].url_context_metadata)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
## How it works
|
| 50 |
+
|
| 51 |
+
The URL Context tool uses a two-step retrieval process to balance speed, cost, and access to fresh data. When you provide a URL, the tool first attempts to fetch the content from an internal index cache. This acts as a highly optimized cache. If a URL is not available in the index (for example, if it's a very new page), the tool automatically falls back to do a live fetch. This directly accesses the URL to retrieve its content in real-time.
|
| 52 |
+
|
| 53 |
+
## Combining with other tools
|
| 54 |
+
|
| 55 |
+
You can combine the URL context tool with other tools to create more powerful workflows.
|
| 56 |
+
|
| 57 |
+
### Grounding with search
|
| 58 |
+
|
| 59 |
+
When both URL context and [Grounding with Google Search](/gemini-api/docs/grounding) are enabled, the model can use its search capabilities to find relevant information online and then use the URL context tool to get a more in-depth understanding of the pages it finds. This approach is powerful for prompts that require both broad searching and deep analysis of specific pages.
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
from google import genai
|
| 63 |
+
from google.genai.types import Tool, GenerateContentConfig, GoogleSearch, UrlContext
|
| 64 |
+
|
| 65 |
+
client = genai.Client()
|
| 66 |
+
model_id = "gemini-2.5-flash"
|
| 67 |
+
|
| 68 |
+
tools = [
|
| 69 |
+
{"url_context": {}},
|
| 70 |
+
{"google_search": {}}
|
| 71 |
+
]
|
| 72 |
+
|
| 73 |
+
response = client.models.generate_content(
|
| 74 |
+
model=model_id,
|
| 75 |
+
contents="Give me three day events schedule based on YOUR_URL. Also let me know what needs to taken care of considering weather and commute.",
|
| 76 |
+
config=GenerateContentConfig(
|
| 77 |
+
tools=tools,
|
| 78 |
+
)
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
for each in response.candidates[0].content.parts:
|
| 82 |
+
print(each.text)
|
| 83 |
+
# get URLs retrieved for context
|
| 84 |
+
print(response.candidates[0].url_context_metadata)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
## Understanding the response
|
| 89 |
+
|
| 90 |
+
When the model uses the URL context tool, the response includes a `url_context_metadata` object. This object lists the URLs the model retrieved content from and the status of each retrieval attempt, which is useful for verification and debugging.
|
| 91 |
+
|
| 92 |
+
The following is an example of that part of the response (parts of the response have been omitted for brevity):
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
{
|
| 96 |
+
"candidates": [
|
| 97 |
+
{
|
| 98 |
+
"content": {
|
| 99 |
+
"parts": [
|
| 100 |
+
{
|
| 101 |
+
"text": "... \n"
|
| 102 |
+
}
|
| 103 |
+
],
|
| 104 |
+
"role": "model"
|
| 105 |
+
},
|
| 106 |
+
...
|
| 107 |
+
"url_context_metadata": {
|
| 108 |
+
"url_metadata": [
|
| 109 |
+
{
|
| 110 |
+
"retrieved_url": "https://www.foodnetwork.com/recipes/ina-garten/perfect-roast-chicken-recipe-1940592",
|
| 111 |
+
"url_retrieval_status": "URL_RETRIEVAL_STATUS_SUCCESS"
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"retrieved_url": "https://www.allrecipes.com/recipe/21151/simple-whole-roast-chicken/",
|
| 115 |
+
"url_retrieval_status": "URL_RETRIEVAL_STATUS_SUCCESS"
|
| 116 |
+
}
|
| 117 |
+
]
|
| 118 |
+
}
|
| 119 |
+
}
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
For complete details about this object, see the [`UrlContextMetadata` API reference](/api/generate-content#UrlContextMetadata).
|
| 124 |
+
|
| 125 |
+
### Safety checks
|
| 126 |
+
|
| 127 |
+
The system performs a content moderation check on each URL to confirm it meets safety standards. If a URL you provide fails this check, you will get a `url_retrieval_status` of `URL_RETRIEVAL_STATUS_UNSAFE`.
|
| 128 |
+
|
| 129 |
+
### Token count
|
| 130 |
+
|
| 131 |
+
The content retrieved from the URLs you specify in your prompt is counted as part of the input tokens. You can see the token count for your prompt and tools usage in the [`usage_metadata`](/api/generate-content#UsageMetadata) object of the model output. The following is an example output:
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
'usage_metadata': {
|
| 135 |
+
'candidates_token_count': 45,
|
| 136 |
+
'prompt_token_count': 27,
|
| 137 |
+
'prompt_tokens_details': [{'modality': <MediaModality.TEXT: 'TEXT'>,
|
| 138 |
+
'token_count': 27}],
|
| 139 |
+
'thoughts_token_count': 31,
|
| 140 |
+
'tool_use_prompt_token_count': 10309,
|
| 141 |
+
'tool_use_prompt_tokens_details': [{'modality': <MediaModality.TEXT: 'TEXT'>,
|
| 142 |
+
'token_count': 10309}],
|
| 143 |
+
'total_token_count': 10412
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
Price per token depends on the model used; see the [pricing](/gemini-api/docs/pricing) page for details.
|
| 148 |
+
|
| 149 |
+
## Supported models
|
| 150 |
+
|
| 151 |
+
* [gemini-2.5-pro](/gemini-api/docs/models#gemini-2.5-pro)
|
| 152 |
+
* [gemini-2.5-flash](/gemini-api/docs/models#gemini-2.5-flash)
|
| 153 |
+
* [gemini-2.5-flash-lite](/gemini-api/docs/models#gemini-2.5-flash-lite)
|
| 154 |
+
* [gemini-live-2.5-flash-preview](/gemini-api/docs/models#live-api)
|
| 155 |
+
* [gemini-2.0-flash-live-001](/gemini-api/docs/models#live-api-2.0)
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
## Best practices
|
| 160 |
+
|
| 161 |
+
* **Provide specific URLs** : For the best results, provide direct URLs to the content you want the model to analyze. The model will only retrieve content from the URLs you provide, not any content from nested links.
|
| 162 |
+
* **Check for accessibility** : Verify that the URLs you provide don't lead to pages that require a login or are behind a paywall.
|
| 163 |
+
* **Use the complete URL** : Provide the full URL, including the protocol (e.g., https://www.google.com instead of just google.com).
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
## Limitations
|
| 168 |
+
|
| 169 |
+
* **Pricing**: Content retrieved from URLs counts as input tokens. Rate limits and pricing are based on the model used. See the [rate limits](/gemini-api/docs/rate-limits) and [pricing](/gemini-api/docs/pricing) pages for details.
|
| 170 |
+
* **Request limit** : The tool can process up to 20 URLs per request.
|
| 171 |
+
* **URL content size** : The maximum size for content retrieved from a single URL is 34MB.
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
### Supported and unsupported content types
|
| 176 |
+
|
| 177 |
+
The tool can extract content from URLs with the following content types:
|
| 178 |
+
|
| 179 |
+
* Text (text/html, application/json, text/plain, text/xml, text/css, text/javascript, text/csv, text/rtf)
|
| 180 |
+
* Image (image/png, image/jpeg, image/bmp, image/webp)
|
| 181 |
+
* PDF (application/pdf)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
The following content types are **not** supported:
|
| 186 |
+
|
| 187 |
+
* Paywalled content
|
| 188 |
+
* YouTube videos (See [video understanding](/gemini-api/docs/video-understanding#youtube) to learn how to process YouTube URLs)
|
| 189 |
+
* Google Workspace files such as Google Docs or Sheets
|
| 190 |
+
* Video and audio files
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
## What's next
|
| 195 |
+
|
| 196 |
+
* Explore the [URL context cookbook](https://colab.sandbox.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Grounding.ipynb#url-context) for more examples.
|
| 197 |
+
|
| 198 |
+
|
documentation_gemini/video_understanding.md
ADDED
|
@@ -0,0 +1,224 @@
|
| 1 |
+
# Video understanding
|
| 2 |
+
|
| 3 |
+
Source: <https://ai.google.dev/gemini-api/docs/video-understanding>
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
Gemini models can process videos, enabling many frontier developer use cases that would have historically required domain-specific models. Some of Gemini's vision capabilities include the ability to:
|
| 8 |
+
|
| 9 |
+
* Describe, segment, and extract information from videos
|
| 10 |
+
* Answer questions about video content
|
| 11 |
+
* Refer to specific timestamps within a video
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
Gemini was built to be multimodal from the ground up and we continue to push the frontier of what is possible. This guide shows how to use the Gemini API to generate text responses based on video inputs.
|
| 16 |
+
|
| 17 |
+
## Video input
|
| 18 |
+
|
| 19 |
+
You can provide videos as input to Gemini in the following ways:
|
| 20 |
+
|
| 21 |
+
* Upload a video file using the File API before making a request to `generateContent`. Use this method for files larger than 20MB, videos longer than approximately 1 minute, or when you want to reuse the file across multiple requests.
|
| 22 |
+
* Pass inline video data with the request to `generateContent`. Use this method for smaller files (<20MB) and shorter durations.
|
| 23 |
+
* Include a YouTube URL directly in the prompt.
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
### Upload a video file
|
| 28 |
+
|
| 29 |
+
You can use the [Files API](/gemini-api/docs/files) to upload a video file. Always use the Files API when the total request size (including the file, text prompt, system instructions, etc.) is larger than 20 MB, the video duration is significant, or if you intend to use the same video in multiple prompts. The File API accepts video file formats directly.
|
| 30 |
+
|
| 31 |
+
The following code uploads a sample video using the File API and then uses the file reference in a `generateContent` request.
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
from google import genai
|
| 35 |
+
|
| 36 |
+
client = genai.Client()
|
| 37 |
+
|
| 38 |
+
myfile = client.files.upload(file="path/to/sample.mp4")
|
| 39 |
+
|
| 40 |
+
response = client.models.generate_content(
|
| 41 |
+
model="gemini-2.5-flash", contents=[myfile, "Summarize this video. Then create a quiz with an answer key based on the information in this video."]
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
print(response.text)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
To learn more about working with media files, see [Files API](/gemini-api/docs/files).
|
| 48 |
+
|
| 49 |
+
### Pass video data inline
|
| 50 |
+
|
| 51 |
+
Instead of uploading a video file using the File API, you can pass smaller videos directly in the request to `generateContent`. This is suitable for shorter videos under 20MB total request size.
|
| 52 |
+
|
| 53 |
+
Here's an example of providing inline video data:
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# Only for videos of size <20Mb
|
| 57 |
+
video_file_name = "/path/to/your/video.mp4"
|
| 58 |
+
video_bytes = open(video_file_name, 'rb').read()
|
| 59 |
+
|
| 60 |
+
response = client.models.generate_content(
|
| 61 |
+
model='models/gemini-2.5-flash',
|
| 62 |
+
contents=types.Content(
|
| 63 |
+
parts=[
|
| 64 |
+
types.Part(
|
| 65 |
+
inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')
|
| 66 |
+
),
|
| 67 |
+
types.Part(text='Please summarize the video in 3 sentences.')
|
| 68 |
+
]
|
| 69 |
+
)
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
### Include a YouTube URL
|
| 74 |
+
|
| 75 |
+
**Preview:** The YouTube URL feature is in preview and is available at no charge. Pricing and rate limits are likely to change.
|
| 76 |
+
|
| 77 |
+
The Gemini API and AI Studio support YouTube URLs as a file data `Part`. You can include a YouTube URL with a prompt asking the model to summarize, translate, or otherwise interact with the video content.
|
| 78 |
+
|
| 79 |
+
**Limitations:**
|
| 80 |
+
|
| 81 |
+
* For the free tier, you can't upload more than 8 hours of YouTube video per day.
|
| 82 |
+
* For the paid tier, there is no limit based on video length.
|
| 83 |
+
* For models before 2.5, you can upload only 1 video per request. For models after 2.5, you can upload a maximum of 10 videos per request.
|
| 84 |
+
* You can only upload public videos (not private or unlisted videos).
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
The following example shows how to include a YouTube URL with a prompt:
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
response = client.models.generate_content(
|
| 92 |
+
model='models/gemini-2.5-flash',
|
| 93 |
+
contents=types.Content(
|
| 94 |
+
parts=[
|
| 95 |
+
types.Part(
|
| 96 |
+
file_data=types.FileData(file_uri='https://www.youtube.com/watch?v=9hE5-98ZeCg')
|
| 97 |
+
),
|
| 98 |
+
types.Part(text='Please summarize the video in 3 sentences.')
|
| 99 |
+
]
|
| 100 |
+
)
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
## Refer to timestamps in the content
|
| 105 |
+
|
| 106 |
+
You can ask questions about specific points in time within the video using timestamps of the form `MM:SS`.
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" # Adjusted timestamps for the NASA video
|
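For example, here is a minimal sketch (the file path is illustrative) that passes a timestamped question together with an uploaded video:

from google import genai

client = genai.Client()

# Hypothetical path; upload the video first, then ask about specific timestamps.
myfile = client.files.upload(file="path/to/sample.mp4")
prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?"

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[myfile, prompt],
)
print(response.text)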
| 110 |
+
|
| 111 |
+
|
| 112 |
+
## Transcribe video and provide visual descriptions
|
| 113 |
+
|
| 114 |
+
The Gemini models can transcribe and provide visual descriptions of video content by processing both the audio track and visual frames. For visual descriptions, the model samples the video at a rate of **1 frame per second**. This sampling rate may affect the level of detail in the descriptions, particularly for videos with rapidly changing visuals.
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions."
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
## Customize video processing
|
| 121 |
+
|
| 122 |
+
You can customize video processing in the Gemini API by setting clipping intervals or providing custom frame rate sampling.
|
| 123 |
+
|
| 124 |
+
**Tip:** Video clipping and frames per second (FPS) are supported by all models, but the quality is significantly higher from 2.5 series models.
|
| 125 |
+
|
| 126 |
+
### Set clipping intervals
|
| 127 |
+
|
| 128 |
+
You can clip video by specifying `videoMetadata` with start and end offsets.
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
response = client.models.generate_content(
|
| 132 |
+
model='models/gemini-2.5-flash',
|
| 133 |
+
contents=types.Content(
|
| 134 |
+
parts=[
|
| 135 |
+
types.Part(
|
| 136 |
+
file_data=types.FileData(file_uri='https://www.youtube.com/watch?v=XEzRZ35urlk'),
|
| 137 |
+
video_metadata=types.VideoMetadata(
|
| 138 |
+
start_offset='1250s',
|
| 139 |
+
end_offset='1570s'
|
| 140 |
+
)
|
| 141 |
+
),
|
| 142 |
+
types.Part(text='Please summarize the video in 3 sentences.')
|
| 143 |
+
]
|
| 144 |
+
)
|
| 145 |
+
)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
### Set a custom frame rate
|
| 149 |
+
|
| 150 |
+
You can set custom frame rate sampling by passing an `fps` argument to `videoMetadata`.
|
| 151 |
+
|
| 152 |
+
**Note:** Due to built-in per-image safety checks, the same video may be blocked at some FPS values and not at others, because different frames are extracted.
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
# Only for videos of size <20Mb
|
| 156 |
+
video_file_name = "/path/to/your/video.mp4"
|
| 157 |
+
video_bytes = open(video_file_name, 'rb').read()
|
| 158 |
+
|
| 159 |
+
response = client.models.generate_content(
|
| 160 |
+
model='models/gemini-2.5-flash',
|
| 161 |
+
contents=types.Content(
|
| 162 |
+
parts=[
|
| 163 |
+
types.Part(
|
| 164 |
+
inline_data=types.Blob(
|
| 165 |
+
data=video_bytes,
|
| 166 |
+
mime_type='video/mp4'),
|
| 167 |
+
video_metadata=types.VideoMetadata(fps=5)
|
| 168 |
+
),
|
| 169 |
+
types.Part(text='Please summarize the video in 3 sentences.')
|
| 170 |
+
]
|
| 171 |
+
)
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
By default, 1 frame per second (FPS) is sampled from the video. You might want to set a low FPS (< 1) for long videos. This is especially useful for mostly static videos (e.g. lectures). If you want to capture more detail in rapidly changing visuals, consider setting a higher FPS value.
|
| 176 |
+
|
| 177 |
+
## Supported video formats
|
| 178 |
+
|
| 179 |
+
Gemini supports the following video format MIME types:
|
| 180 |
+
|
| 181 |
+
* `video/mp4`
|
| 182 |
+
* `video/mpeg`
|
| 183 |
+
* `video/mov`
|
| 184 |
+
* `video/avi`
|
| 185 |
+
* `video/x-flv`
|
| 186 |
+
* `video/mpg`
|
| 187 |
+
* `video/webm`
|
| 188 |
+
* `video/wmv`
|
| 189 |
+
* `video/3gpp`
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
## Technical details about videos
|
| 194 |
+
|
| 195 |
+
* **Supported models & context**: All Gemini 2.0 and 2.5 models can process video data.
|
| 196 |
+
* Models with a 2M context window can process videos up to 2 hours long at default media resolution or 6 hours long at low media resolution, while models with a 1M context window can process videos up to 1 hour long at default media resolution or 3 hours long at low media resolution.
|
| 197 |
+
* **File API processing** : When using the File API, videos are stored at 1 frame per second (FPS) and audio is processed at 1Kbps (single channel). Timestamps are added every second.
|
| 198 |
+
* These rates are subject to change in the future for improvements in inference.
|
| 199 |
+
* You can override the 1 FPS sampling rate by setting a custom frame rate.
|
| 200 |
+
* **Token calculation**: Each second of video is tokenized as follows (a worked example follows this list):
|
| 201 |
+
* Individual frames (sampled at 1 FPS):
|
| 202 |
+
* If [`mediaResolution`](/api/generate-content#MediaResolution) is set to low, frames are tokenized at 66 tokens per frame.
|
| 203 |
+
* Otherwise, frames are tokenized at 258 tokens per frame.
|
| 204 |
+
* Audio: 32 tokens per second.
|
| 205 |
+
* Metadata is also included.
|
| 206 |
+
* Total: Approximately 300 tokens per second of video at default media resolution, or 100 tokens per second of video at low media resolution.
|
| 207 |
+
* **Timestamp format** : When referring to specific moments in a video within your prompt, use the `MM:SS` format (e.g., `01:15` for 1 minute and 15 seconds).
|
| 208 |
+
* **Best practices** :
|
| 209 |
+
* Use only one video per prompt request for optimal results.
|
| 210 |
+
* If combining text and a single video, place the text prompt _after_ the video part in the `contents` array.
|
| 211 |
+
* Be aware that fast action sequences might lose detail due to the 1 FPS sampling rate. Consider slowing down such clips if necessary.
|
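As a rough worked example of the token calculation above: a one-minute clip at default media resolution comes to about 60 × (258 + 32) = 17,400 tokens for frames and audio, plus metadata, which matches the ~300 tokens per second figure. At low media resolution the same clip is roughly 60 × (66 + 32) = 5,880 tokens, i.e. about 100 tokens per second.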
| 212 |
+
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
## What's next
|
| 216 |
+
|
| 217 |
+
This guide shows how to upload video files and generate text outputs from video inputs. To learn more, see the following resources:
|
| 218 |
+
|
| 219 |
+
* [System instructions](/gemini-api/docs/text-generation#system-instructions): System instructions let you steer the behavior of the model based on your specific needs and use cases.
|
| 220 |
+
* [Files API](/gemini-api/docs/files): Learn more about uploading and managing files for use with Gemini.
|
| 221 |
+
* [File prompting strategies](/gemini-api/docs/files#prompt-guide): The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting.
|
| 222 |
+
* [Safety guidance](/gemini-api/docs/safety-guidance): Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs.
|
| 223 |
+
|
| 224 |
+
|
templates/index.html
CHANGED
|
@@ -181,6 +181,33 @@
|
|
| 181 |
const formData = new FormData();
|
| 182 |
formData.append('file', file);
|
| 183 |
|
| 184 |
fetch('/upload', {
|
| 185 |
method: 'POST',
|
| 186 |
body: formData
|
|
@@ -189,19 +216,14 @@
|
|
| 189 |
.then(data => {
|
| 190 |
if (data.success) {
|
| 191 |
currentFile = data;
|
| 192 |
-
filePreview.innerHTML = `
|
| 193 |
-
<div class="flex justify-between items-center">
|
| 194 |
-
<span class="text-gray-300">📎 ${data.filename}</span>
|
| 195 |
-
<button onclick="removeFile()" class="bg-gray-600 hover:bg-gray-500 text-white px-3 py-1 rounded transition-colors">✕</button>
|
| 196 |
-
</div>
|
| 197 |
-
`;
|
| 198 |
-
filePreview.classList.remove('hidden');
|
| 199 |
} else {
|
| 200 |
showError('Erreur lors du téléchargement: ' + data.error);
|
|
|
|
| 201 |
}
|
| 202 |
})
|
| 203 |
.catch(error => {
|
| 204 |
showError('Erreur lors du téléchargement: ' + error.message);
|
|
|
|
| 205 |
});
|
| 206 |
}
|
| 207 |
|
|
@@ -223,8 +245,8 @@
|
|
| 223 |
messageInput.disabled = true;
|
| 224 |
sendButton.disabled = true;
|
| 225 |
|
| 226 |
-
if (message) {
|
| 227 |
-
addMessage('user', message);
|
| 228 |
}
|
| 229 |
|
| 230 |
typingIndicator.classList.remove('hidden');
|
|
@@ -330,19 +352,30 @@
|
|
| 330 |
});
|
| 331 |
}
|
| 332 |
|
| 333 |
-
function addMessage(role, content) {
|
| 334 |
const messagesContainer = document.getElementById('messages');
|
| 335 |
const messageDiv = document.createElement('div');
|
| 336 |
-
|
| 337 |
if (role === 'user') {
|
| 338 |
messageDiv.className = 'max-w-[85%] p-4 rounded-2xl bg-gray-600 text-gray-100 self-end rounded-br-sm animate-fade-in';
|
| 339 |
-
const markdownHtml = marked.parse(content);
|
| 340 |
-
messageDiv.innerHTML = `<div class="markdown-content">${markdownHtml}</div>`;
|
| 341 |
} else {
|
| 342 |
messageDiv.className = 'max-w-[85%] p-4 rounded-2xl bg-dark-elevated text-gray-200 self-start rounded-bl-sm animate-fade-in';
|
| 343 |
-
messageDiv.innerHTML = `<div class="markdown-content">${content}</div>`;
|
| 344 |
}
|
| 345 |
-
|
|
|
|
|
|
|
| 346 |
messagesContainer.appendChild(messageDiv);
|
| 347 |
messagesContainer.scrollTop = messagesContainer.scrollHeight;
|
| 348 |
return messageDiv;
|
|
|
|
| 181 |
const formData = new FormData();
|
| 182 |
formData.append('file', file);
|
| 183 |
|
| 184 |
+
// Show immediate preview for images
|
| 185 |
+
if (file.type.startsWith('image/')) {
|
| 186 |
+
const reader = new FileReader();
|
| 187 |
+
reader.onload = function(e) {
|
| 188 |
+
filePreview.innerHTML = `
|
| 189 |
+
<div class="flex items-center gap-3">
|
| 190 |
+
<img src="${e.target.result}" alt="${file.name}" class="w-16 h-16 object-cover rounded-lg border border-gray-600">
|
| 191 |
+
<div class="flex-1">
|
| 192 |
+
<div class="text-gray-300 text-sm font-medium">${file.name}</div>
|
| 193 |
+
<div class="text-gray-500 text-xs">${(file.size / 1024).toFixed(1)} KB</div>
|
| 194 |
+
</div>
|
| 195 |
+
<button onclick="removeFile()" class="bg-gray-600 hover:bg-gray-500 text-white px-3 py-1 rounded transition-colors">✕</button>
|
| 196 |
+
</div>
|
| 197 |
+
`;
|
| 198 |
+
filePreview.classList.remove('hidden');
|
| 199 |
+
};
|
| 200 |
+
reader.readAsDataURL(file);
|
| 201 |
+
} else {
|
| 202 |
+
filePreview.innerHTML = `
|
| 203 |
+
<div class="flex justify-between items-center">
|
| 204 |
+
<span class="text-gray-300">📎 ${file.name}</span>
|
| 205 |
+
<button onclick="removeFile()" class="bg-gray-600 hover:bg-gray-500 text-white px-3 py-1 rounded transition-colors">✕</button>
|
| 206 |
+
</div>
|
| 207 |
+
`;
|
| 208 |
+
filePreview.classList.remove('hidden');
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
fetch('/upload', {
|
| 212 |
method: 'POST',
|
| 213 |
body: formData
|
|
|
|
| 216 |
.then(data => {
|
| 217 |
if (data.success) {
|
| 218 |
currentFile = data;
|
| 219 |
} else {
|
| 220 |
showError('Erreur lors du téléchargement: ' + data.error);
|
| 221 |
+
filePreview.classList.add('hidden');
|
| 222 |
}
|
| 223 |
})
|
| 224 |
.catch(error => {
|
| 225 |
showError('Erreur lors du téléchargement: ' + error.message);
|
| 226 |
+
filePreview.classList.add('hidden');
|
| 227 |
});
|
| 228 |
}
|
| 229 |
|
|
|
|
| 245 |
messageInput.disabled = true;
|
| 246 |
sendButton.disabled = true;
|
| 247 |
|
| 248 |
+
if (message || currentFile) {
|
| 249 |
+
addMessage('user', message, currentFile);
|
| 250 |
}
|
| 251 |
|
| 252 |
typingIndicator.classList.remove('hidden');
|
|
|
|
| 352 |
});
|
| 353 |
}
|
| 354 |
|
| 355 |
+
function addMessage(role, content, fileData = null) {
|
| 356 |
const messagesContainer = document.getElementById('messages');
|
| 357 |
const messageDiv = document.createElement('div');
|
| 358 |
+
|
| 359 |
+
let messageContent = '';
|
| 360 |
+
|
| 361 |
+
if (fileData && fileData.mime_type && fileData.mime_type.startsWith('image/')) {
|
| 362 |
+
const imageUrl = `data:${fileData.mime_type};base64,${fileData.data}`;
|
| 363 |
+
messageContent += `<img src="${imageUrl}" alt="${fileData.filename}" class="max-w-full h-auto rounded-lg mb-2 border border-gray-600">`;
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
if (content) {
|
| 367 |
+
const markdownHtml = marked.parse(content);
|
| 368 |
+
messageContent += `<div class="markdown-content">${markdownHtml}</div>`;
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
if (role === 'user') {
|
| 372 |
messageDiv.className = 'max-w-[85%] p-4 rounded-2xl bg-gray-600 text-gray-100 self-end rounded-br-sm animate-fade-in';
|
|
|
|
|
|
|
| 373 |
} else {
|
| 374 |
messageDiv.className = 'max-w-[85%] p-4 rounded-2xl bg-dark-elevated text-gray-200 self-start rounded-bl-sm animate-fade-in';
|
|
|
|
| 375 |
}
|
| 376 |
+
|
| 377 |
+
messageDiv.innerHTML = messageContent;
|
| 378 |
+
|
| 379 |
messagesContainer.appendChild(messageDiv);
|
| 380 |
messagesContainer.scrollTop = messagesContainer.scrollHeight;
|
| 381 |
return messageDiv;
|