Commit 66a0b8d4

stainless-app[bot] <142633134+stainless-app[bot]@users.noreply.github.com>
2025-05-17 03:41:25
feat(api): further updates for evals API
1 parent e5de794
src/openai/resources/evals/runs/runs.py
@@ -72,9 +72,10 @@ class Runs(SyncAPIResource):
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
     ) -> RunCreateResponse:
-        """Create a new evaluation run.
-
-        This is the endpoint that will kick off grading.
+        """
+        Kicks off a new run for a given evaluation, specifying the data source and the
+        model configuration to use for testing. The data source will be validated
+        against the schema specified in the config of the evaluation.
 
         Args:
           data_source: Details about the run's data source.
@@ -321,9 +322,10 @@ class AsyncRuns(AsyncAPIResource):
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
     ) -> RunCreateResponse:
-        """Create a new evaluation run.
-
-        This is the endpoint that will kick off grading.
+        """
+        Kicks off a new run for a given evaluation, specifying the data source and the
+        model configuration to use for testing. The data source will be validated
+        against the schema specified in the config of the evaluation.
 
         Args:
           data_source: Details about the run's data source.
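
For context, a minimal sketch of calling this endpoint from the SDK. The eval ID, file ID, and run name are placeholders, and printing `run.status` assumes the response's status field:

```python
from openai import OpenAI

client = OpenAI()

# Kick off a run for an existing eval. The data source payload is validated
# against the schema declared in the eval's data_source_config.
run = client.evals.runs.create(
    eval_id="eval_abc123",  # placeholder eval ID
    name="gpt-4o-mini baseline",
    data_source={
        "type": "jsonl",
        "source": {"type": "file_id", "id": "file-abc123"},  # placeholder file ID
    },
)
print(run.id, run.status)
```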
src/openai/resources/evals/evals.py
@@ -74,15 +74,20 @@ class Evals(SyncAPIResource):
     ) -> EvalCreateResponse:
         """
         Create the structure of an evaluation that can be used to test a model's
-        performance. An evaluation is a set of testing criteria and a datasource. After
+        performance. An evaluation is a set of testing criteria and the config for a
+        data source, which dictates the schema of the data used in the evaluation. After
         creating an evaluation, you can run it on different models and model parameters.
         We support several types of graders and datasources. For more information, see
         the [Evals guide](https://platform.openai.com/docs/guides/evals).
 
         Args:
-          data_source_config: The configuration for the data source used for the evaluation runs.
+          data_source_config: The configuration for the data source used for the evaluation runs. Dictates the
+              schema of the data used in the evaluation.
 
-          testing_criteria: A list of graders for all eval runs in this group.
+          testing_criteria: A list of graders for all eval runs in this group. Graders can reference
+              variables in the data source using double curly braces notation, like
+              `{{item.variable_name}}`. To reference the model's output, use the `sample`
+              namespace (ie, `{{sample.output_text}}`).
 
           metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful
               for storing additional information about the object in a structured format, and
@@ -333,15 +338,20 @@ class AsyncEvals(AsyncAPIResource):
     ) -> EvalCreateResponse:
         """
         Create the structure of an evaluation that can be used to test a model's
-        performance. An evaluation is a set of testing criteria and a datasource. After
+        performance. An evaluation is a set of testing criteria and the config for a
+        data source, which dictates the schema of the data used in the evaluation. After
         creating an evaluation, you can run it on different models and model parameters.
         We support several types of graders and datasources. For more information, see
         the [Evals guide](https://platform.openai.com/docs/guides/evals).
 
         Args:
-          data_source_config: The configuration for the data source used for the evaluation runs.
+          data_source_config: The configuration for the data source used for the evaluation runs. Dictates the
+              schema of the data used in the evaluation.
 
-          testing_criteria: A list of graders for all eval runs in this group.
+          testing_criteria: A list of graders for all eval runs in this group. Graders can reference
+              variables in the data source using double curly braces notation, like
+              `{{item.variable_name}}`. To reference the model's output, use the `sample`
+              namespace (ie, `{{sample.output_text}}`).
 
           metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful
               for storing additional information about the object in a structured format, and
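
To make the updated docstring concrete, a sketch of creating an eval whose `data_source_config` dictates the item schema and whose grader references `{{item.*}}` and `{{sample.*}}` variables. The schema fields and grader values are illustrative:

```python
from openai import OpenAI

client = OpenAI()

evaluation = client.evals.create(
    name="math-answers",  # illustrative name
    data_source_config={
        "type": "custom",
        # The item schema dictates what each data source row must contain.
        "item_schema": {
            "type": "object",
            "properties": {
                "question": {"type": "string"},
                "expected": {"type": "string"},
            },
            "required": ["question", "expected"],
        },
        # Exposes the sample namespace so graders can read the model's output.
        "include_sample_schema": True,
    },
    testing_criteria=[
        {
            "type": "string_check",
            "name": "exact match",
            # Double curly braces reference data source and sample variables.
            "input": "{{sample.output_text}}",
            "reference": "{{item.expected}}",
            "operation": "eq",
        }
    ],
)
```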
src/openai/types/beta/realtime/transcription_session_updated_event.py
@@ -16,7 +16,7 @@ class TranscriptionSessionUpdatedEvent(BaseModel):
     """A new Realtime transcription session configuration.
 
     When a session is created on the server via REST API, the session object also
-    contains an ephemeral key. Default TTL for keys is one minute. This property is
+    contains an ephemeral key. Default TTL for keys is 10 minutes. This property is
     not present when a session is updated via the WebSocket API.
     """
 
src/openai/types/evals/create_eval_completions_run_data_source.py
@@ -117,7 +117,7 @@ class InputMessagesTemplate(BaseModel):
     template: List[InputMessagesTemplateTemplate]
     """A list of chat messages forming the prompt or context.
 
-    May include variable references to the "item" namespace, ie {{item.name}}.
+    May include variable references to the `item` namespace, ie {{item.name}}.
     """
 
     type: Literal["template"]
@@ -126,7 +126,7 @@ class InputMessagesTemplate(BaseModel):
 
 class InputMessagesItemReference(BaseModel):
     item_reference: str
-    """A reference to a variable in the "item" namespace. Ie, "item.name" """
+    """A reference to a variable in the `item` namespace. Ie, "item.input_trajectory" """
 
     type: Literal["item_reference"]
     """The type of input messages. Always `item_reference`."""
@@ -153,12 +153,18 @@ class SamplingParams(BaseModel):
 
 class CreateEvalCompletionsRunDataSource(BaseModel):
     source: Source
-    """A StoredCompletionsRunDataSource configuration describing a set of filters"""
+    """Determines what populates the `item` namespace in this run's data source."""
 
     type: Literal["completions"]
     """The type of run data source. Always `completions`."""
 
     input_messages: Optional[InputMessages] = None
+    """Used when sampling from a model.
+
+    Dictates the structure of the messages passed into the model. Can either be a
+    reference to a prebuilt trajectory (ie, `item.input_trajectory`), or a template
+    with variable references to the `item` namespace.
+    """
 
     model: Optional[str] = None
     """The name of the model to use for generating completions (e.g. "o3-mini")."""
src/openai/types/evals/create_eval_completions_run_data_source_param.py
@@ -113,7 +113,7 @@ class InputMessagesTemplate(TypedDict, total=False):
     template: Required[Iterable[InputMessagesTemplateTemplate]]
     """A list of chat messages forming the prompt or context.
 
-    May include variable references to the "item" namespace, ie {{item.name}}.
+    May include variable references to the `item` namespace, ie {{item.name}}.
     """
 
     type: Required[Literal["template"]]
@@ -122,7 +122,7 @@ class InputMessagesTemplate(TypedDict, total=False):
 
 class InputMessagesItemReference(TypedDict, total=False):
     item_reference: Required[str]
-    """A reference to a variable in the "item" namespace. Ie, "item.name" """
+    """A reference to a variable in the `item` namespace. Ie, "item.input_trajectory" """
 
     type: Required[Literal["item_reference"]]
     """The type of input messages. Always `item_reference`."""
@@ -147,12 +147,18 @@ class SamplingParams(TypedDict, total=False):
 
 class CreateEvalCompletionsRunDataSourceParam(TypedDict, total=False):
     source: Required[Source]
-    """A StoredCompletionsRunDataSource configuration describing a set of filters"""
+    """Determines what populates the `item` namespace in this run's data source."""
 
     type: Required[Literal["completions"]]
     """The type of run data source. Always `completions`."""
 
     input_messages: InputMessages
+    """Used when sampling from a model.
+
+    Dictates the structure of the messages passed into the model. Can either be a
+    reference to a prebuilt trajectory (ie, `item.input_trajectory`), or a template
+    with variable references to the `item` namespace.
+    """
 
     model: str
     """The name of the model to use for generating completions (e.g. "o3-mini")."""
src/openai/types/evals/create_eval_jsonl_run_data_source.py
@@ -36,6 +36,7 @@ Source: TypeAlias = Annotated[Union[SourceFileContent, SourceFileID], PropertyIn
 
 class CreateEvalJSONLRunDataSource(BaseModel):
     source: Source
+    """Determines what populates the `item` namespace in the data source."""
 
     type: Literal["jsonl"]
     """The type of data source. Always `jsonl`."""
src/openai/types/evals/create_eval_jsonl_run_data_source_param.py
@@ -41,6 +41,7 @@ Source: TypeAlias = Union[SourceFileContent, SourceFileID]
 
 class CreateEvalJSONLRunDataSourceParam(TypedDict, total=False):
     source: Required[Source]
+    """Determines what populates the `item` namespace in the data source."""
 
     type: Required[Literal["jsonl"]]
     """The type of data source. Always `jsonl`."""
src/openai/types/evals/run_cancel_response.py
@@ -76,12 +76,6 @@ class DataSourceResponsesSourceResponses(BaseModel):
     This is a query parameter used to select responses.
     """
 
-    has_tool_calls: Optional[bool] = None
-    """Whether the response has tool calls.
-
-    This is a query parameter used to select responses.
-    """
-
     instructions_search: Optional[str] = None
     """Optional string to search the 'instructions' field.
 
@@ -170,7 +164,7 @@ class DataSourceResponsesInputMessagesTemplate(BaseModel):
     template: List[DataSourceResponsesInputMessagesTemplateTemplate]
     """A list of chat messages forming the prompt or context.
 
-    May include variable references to the "item" namespace, ie {{item.name}}.
+    May include variable references to the `item` namespace, ie {{item.name}}.
     """
 
     type: Literal["template"]
@@ -179,7 +173,7 @@ class DataSourceResponsesInputMessagesTemplate(BaseModel):
 
 class DataSourceResponsesInputMessagesItemReference(BaseModel):
     item_reference: str
-    """A reference to a variable in the "item" namespace. Ie, "item.name" """
+    """A reference to a variable in the `item` namespace. Ie, "item.name" """
 
     type: Literal["item_reference"]
     """The type of input messages. Always `item_reference`."""
@@ -207,12 +201,18 @@ class DataSourceResponsesSamplingParams(BaseModel):
 
 class DataSourceResponses(BaseModel):
     source: DataSourceResponsesSource
-    """A EvalResponsesSource object describing a run data source configuration."""
+    """Determines what populates the `item` namespace in this run's data source."""
 
     type: Literal["responses"]
     """The type of run data source. Always `responses`."""
 
     input_messages: Optional[DataSourceResponsesInputMessages] = None
+    """Used when sampling from a model.
+
+    Dictates the structure of the messages passed into the model. Can either be a
+    reference to a prebuilt trajectory (ie, `item.input_trajectory`), or a template
+    with variable references to the `item` namespace.
+    """
 
     model: Optional[str] = None
     """The name of the model to use for generating completions (e.g. "o3-mini")."""
src/openai/types/evals/run_create_params.py
@@ -88,12 +88,6 @@ class DataSourceCreateEvalResponsesRunDataSourceSourceResponses(TypedDict, total
     This is a query parameter used to select responses.
     """
 
-    has_tool_calls: Optional[bool]
-    """Whether the response has tool calls.
-
-    This is a query parameter used to select responses.
-    """
-
     instructions_search: Optional[str]
     """Optional string to search the 'instructions' field.
 
@@ -187,7 +181,7 @@ class DataSourceCreateEvalResponsesRunDataSourceInputMessagesTemplate(TypedDict,
     template: Required[Iterable[DataSourceCreateEvalResponsesRunDataSourceInputMessagesTemplateTemplate]]
     """A list of chat messages forming the prompt or context.
 
-    May include variable references to the "item" namespace, ie {{item.name}}.
+    May include variable references to the `item` namespace, ie {{item.name}}.
     """
 
     type: Required[Literal["template"]]
@@ -196,7 +190,7 @@ class DataSourceCreateEvalResponsesRunDataSourceInputMessagesTemplate(TypedDict,
 
 class DataSourceCreateEvalResponsesRunDataSourceInputMessagesItemReference(TypedDict, total=False):
     item_reference: Required[str]
-    """A reference to a variable in the "item" namespace. Ie, "item.name" """
+    """A reference to a variable in the `item` namespace. Ie, "item.name" """
 
     type: Required[Literal["item_reference"]]
     """The type of input messages. Always `item_reference`."""
@@ -224,12 +218,18 @@ class DataSourceCreateEvalResponsesRunDataSourceSamplingParams(TypedDict, total=
 
 class DataSourceCreateEvalResponsesRunDataSource(TypedDict, total=False):
     source: Required[DataSourceCreateEvalResponsesRunDataSourceSource]
-    """A EvalResponsesSource object describing a run data source configuration."""
+    """Determines what populates the `item` namespace in this run's data source."""
 
     type: Required[Literal["responses"]]
     """The type of run data source. Always `responses`."""
 
     input_messages: DataSourceCreateEvalResponsesRunDataSourceInputMessages
+    """Used when sampling from a model.
+
+    Dictates the structure of the messages passed into the model. Can either be a
+    reference to a prebuilt trajectory (ie, `item.input_trajectory`), or a template
+    with variable references to the `item` namespace.
+    """
 
     model: str
     """The name of the model to use for generating completions (e.g. "o3-mini")."""
src/openai/types/evals/run_create_response.py
@@ -76,12 +76,6 @@ class DataSourceResponsesSourceResponses(BaseModel):
     This is a query parameter used to select responses.
     """
 
-    has_tool_calls: Optional[bool] = None
-    """Whether the response has tool calls.
-
-    This is a query parameter used to select responses.
-    """
-
     instructions_search: Optional[str] = None
     """Optional string to search the 'instructions' field.
 
@@ -170,7 +164,7 @@ class DataSourceResponsesInputMessagesTemplate(BaseModel):
     template: List[DataSourceResponsesInputMessagesTemplateTemplate]
     """A list of chat messages forming the prompt or context.
 
-    May include variable references to the "item" namespace, ie {{item.name}}.
+    May include variable references to the `item` namespace, ie {{item.name}}.
     """
 
     type: Literal["template"]
@@ -179,7 +173,7 @@ class DataSourceResponsesInputMessagesTemplate(BaseModel):
 
 class DataSourceResponsesInputMessagesItemReference(BaseModel):
     item_reference: str
-    """A reference to a variable in the "item" namespace. Ie, "item.name" """
+    """A reference to a variable in the `item` namespace. Ie, "item.name" """
 
     type: Literal["item_reference"]
     """The type of input messages. Always `item_reference`."""
@@ -207,12 +201,18 @@ class DataSourceResponsesSamplingParams(BaseModel):
 
 class DataSourceResponses(BaseModel):
     source: DataSourceResponsesSource
-    """A EvalResponsesSource object describing a run data source configuration."""
+    """Determines what populates the `item` namespace in this run's data source."""
 
     type: Literal["responses"]
     """The type of run data source. Always `responses`."""
 
     input_messages: Optional[DataSourceResponsesInputMessages] = None
+    """Used when sampling from a model.
+
+    Dictates the structure of the messages passed into the model. Can either be a
+    reference to a prebuilt trajectory (ie, `item.input_trajectory`), or a template
+    with variable references to the `item` namespace.
+    """
 
     model: Optional[str] = None
     """The name of the model to use for generating completions (e.g. "o3-mini")."""
src/openai/types/evals/run_list_response.py
@@ -76,12 +76,6 @@ class DataSourceResponsesSourceResponses(BaseModel):
     This is a query parameter used to select responses.
     """
 
-    has_tool_calls: Optional[bool] = None
-    """Whether the response has tool calls.
-
-    This is a query parameter used to select responses.
-    """
-
     instructions_search: Optional[str] = None
     """Optional string to search the 'instructions' field.
 
@@ -170,7 +164,7 @@ class DataSourceResponsesInputMessagesTemplate(BaseModel):
     template: List[DataSourceResponsesInputMessagesTemplateTemplate]
     """A list of chat messages forming the prompt or context.
 
-    May include variable references to the "item" namespace, ie {{item.name}}.
+    May include variable references to the `item` namespace, ie {{item.name}}.
     """
 
     type: Literal["template"]
@@ -179,7 +173,7 @@ class DataSourceResponsesInputMessagesTemplate(BaseModel):
 
 class DataSourceResponsesInputMessagesItemReference(BaseModel):
     item_reference: str
-    """A reference to a variable in the "item" namespace. Ie, "item.name" """
+    """A reference to a variable in the `item` namespace. Ie, "item.name" """
 
     type: Literal["item_reference"]
     """The type of input messages. Always `item_reference`."""
@@ -207,12 +201,18 @@ class DataSourceResponsesSamplingParams(BaseModel):
 
 class DataSourceResponses(BaseModel):
     source: DataSourceResponsesSource
-    """A EvalResponsesSource object describing a run data source configuration."""
+    """Determines what populates the `item` namespace in this run's data source."""
 
     type: Literal["responses"]
     """The type of run data source. Always `responses`."""
 
     input_messages: Optional[DataSourceResponsesInputMessages] = None
+    """Used when sampling from a model.
+
+    Dictates the structure of the messages passed into the model. Can either be a
+    reference to a prebuilt trajectory (ie, `item.input_trajectory`), or a template
+    with variable references to the `item` namespace.
+    """
 
     model: Optional[str] = None
     """The name of the model to use for generating completions (e.g. "o3-mini")."""
src/openai/types/evals/run_retrieve_response.py
@@ -76,12 +76,6 @@ class DataSourceResponsesSourceResponses(BaseModel):
     This is a query parameter used to select responses.
     """
 
-    has_tool_calls: Optional[bool] = None
-    """Whether the response has tool calls.
-
-    This is a query parameter used to select responses.
-    """
-
     instructions_search: Optional[str] = None
     """Optional string to search the 'instructions' field.
 
@@ -170,7 +164,7 @@ class DataSourceResponsesInputMessagesTemplate(BaseModel):
     template: List[DataSourceResponsesInputMessagesTemplateTemplate]
     """A list of chat messages forming the prompt or context.
 
-    May include variable references to the "item" namespace, ie {{item.name}}.
+    May include variable references to the `item` namespace, ie {{item.name}}.
     """
 
     type: Literal["template"]
@@ -179,7 +173,7 @@ class DataSourceResponsesInputMessagesTemplate(BaseModel):
 
 class DataSourceResponsesInputMessagesItemReference(BaseModel):
     item_reference: str
-    """A reference to a variable in the "item" namespace. Ie, "item.name" """
+    """A reference to a variable in the `item` namespace. Ie, "item.name" """
 
     type: Literal["item_reference"]
     """The type of input messages. Always `item_reference`."""
@@ -207,12 +201,18 @@ class DataSourceResponsesSamplingParams(BaseModel):
 
 class DataSourceResponses(BaseModel):
     source: DataSourceResponsesSource
-    """A EvalResponsesSource object describing a run data source configuration."""
+    """Determines what populates the `item` namespace in this run's data source."""
 
     type: Literal["responses"]
     """The type of run data source. Always `responses`."""
 
     input_messages: Optional[DataSourceResponsesInputMessages] = None
+    """Used when sampling from a model.
+
+    Dictates the structure of the messages passed into the model. Can either be a
+    reference to a prebuilt trajectory (ie, `item.input_trajectory`), or a template
+    with variable references to the `item` namespace.
+    """
 
     model: Optional[str] = None
     """The name of the model to use for generating completions (e.g. "o3-mini")."""
src/openai/types/eval_create_params.py
@@ -33,10 +33,18 @@ __all__ = [
 
 class EvalCreateParams(TypedDict, total=False):
     data_source_config: Required[DataSourceConfig]
-    """The configuration for the data source used for the evaluation runs."""
+    """The configuration for the data source used for the evaluation runs.
+
+    Dictates the schema of the data used in the evaluation.
+    """
 
     testing_criteria: Required[Iterable[TestingCriterion]]
-    """A list of graders for all eval runs in this group."""
+    """A list of graders for all eval runs in this group.
+
+    Graders can reference variables in the data source using double curly braces
+    notation, like `{{item.variable_name}}`. To reference the model's output, use
+    the `sample` namespace (ie, `{{sample.output_text}}`).
+    """
 
     metadata: Optional[Metadata]
     """Set of 16 key-value pairs that can be attached to an object.
@@ -75,8 +83,8 @@ class DataSourceConfigLogs(TypedDict, total=False):
 
 
 class DataSourceConfigStoredCompletions(TypedDict, total=False):
-    type: Required[Literal["stored-completions"]]
-    """The type of data source. Always `stored-completions`."""
+    type: Required[Literal["stored_completions"]]
+    """The type of data source. Always `stored_completions`."""
 
     metadata: Dict[str, object]
     """Metadata filters for the stored completions data source."""
@@ -129,7 +137,7 @@ class TestingCriterionLabelModel(TypedDict, total=False):
     input: Required[Iterable[TestingCriterionLabelModelInput]]
     """A list of chat messages forming the prompt or context.
 
-    May include variable references to the "item" namespace, ie {{item.name}}.
+    May include variable references to the `item` namespace, ie {{item.name}}.
     """
 
     labels: Required[List[str]]
src/openai/types/eval_stored_completions_data_source_config.py
@@ -18,8 +18,8 @@ class EvalStoredCompletionsDataSourceConfig(BaseModel):
     [here](https://json-schema.org/).
     """
 
-    type: Literal["stored-completions"]
-    """The type of data source. Always `stored-completions`."""
+    type: Literal["stored_completions"]
+    """The type of data source. Always `stored_completions`."""
 
     metadata: Optional[Metadata] = None
     """Set of 16 key-value pairs that can be attached to an object.
.stats.yml
@@ -1,4 +1,4 @@
 configured_endpoints: 101
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-5fa16b9a02985ae06e41be14946a9c325dc672fb014b3c19abca65880c6990e6.yml
-openapi_spec_hash: da3e669f65130043b1170048c0727890
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-262e171d0a8150ea1192474d16ba3afdf9a054b399f1a49a9c9b697a3073c136.yml
+openapi_spec_hash: 33e00a48df8f94c94f46290c489f132b
 config_hash: d8d5fda350f6db77c784f35429741a2e