Commit 26fbacb7
Removes the `--use_packing`/`--no_packing` fine-tuning flags from the CLI, validators, and example notebooks, and bumps the package version to 0.11.5.
Changed files (6)
examples/embeddings/Code_search.ipynb
@@ -260,7 +260,7 @@
"def format_inferrer_validator(df):\n",
" \"\"\"\n",
" This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification.\n",
- " It will also suggest to use ada, --no_packing and explain train/validation split benefits.\n",
+ " It will also suggest to use ada and explain train/validation split benefits.\n",
" \"\"\"\n",
" ft_type = infer_task_type(df)\n",
" immediate_msg = None\n",
examples/finetuning/finetuning-classification.ipynb
@@ -257,7 +257,7 @@
"\n",
"- Your file contains 1197 prompt-completion pairs\n",
"- Based on your data it seems like you're trying to fine-tune a model for classification\n",
- "- For classification, we recommend you try one of the faster and cheaper models, such as `ada`. You should also set the `--no_packing` parameter when fine-tuning\n",
+ "- For classification, we recommend you try one of the faster and cheaper models, such as `ada`\n",
"- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training\n",
"- There are 11 examples that are very long. These are rows: [134, 200, 281, 320, 404, 595, 704, 838, 1113, 1139, 1174]\n",
"For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.\n",
@@ -277,7 +277,7 @@
"Feel free to take a look!\n",
"\n",
"Now use that file when fine-tuning:\n",
- "> openai api fine_tunes.create -t \"sport2_prepared_train.jsonl\" -v \"sport2_prepared_valid.jsonl\" --no_packing --compute_classification_metrics --classification_positive_class \" baseball\"\n",
+ "> openai api fine_tunes.create -t \"sport2_prepared_train.jsonl\" -v \"sport2_prepared_valid.jsonl\" --compute_classification_metrics --classification_positive_class \" baseball\"\n",
"\n",
"After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `\\n\\n###\\n\\n` for the model to start generating completions, rather than continuing with the prompt.\n",
"Once your model starts training, it'll approximately take 30.8 minutes to train a `curie` model, and less for `ada` and `babbage`. Queue will approximately take half an hour per job ahead of you.\n"
@@ -301,7 +301,7 @@
"cell_type": "markdown",
"source": [
"## Fine-tuning\n",
- "The tool suggests we run the following command to train the dataset. Since this is a classification task, we would like to know what the generalization performance on the provided validation set is for our classification use case. The tool suggests to add `--compute_classification_metrics --classification_positive_class \" baseball\"` in order to compute the classification metrics. Classification performs better with a hyperparameter `--no_packing`.\n",
+ "The tool suggests we run the following command to train the dataset. Since this is a classification task, we would like to know what the generalization performance on the provided validation set is for our classification use case. The tool suggests to add `--compute_classification_metrics --classification_positive_class \" baseball\"` in order to compute the classification metrics.\n",
"\n",
"We can simply copy the suggested command from the CLI tool. We specifically add `-m ada` to fine-tune a cheaper and faster ada model, which is usually comperable in performance to slower and more expensive models on classification use cases. "
],
@@ -311,7 +311,7 @@
"cell_type": "code",
"execution_count": 9,
"source": [
- "!openai api fine_tunes.create -t \"sport2_prepared_train.jsonl\" -v \"sport2_prepared_valid.jsonl\" --no_packing --compute_classification_metrics --classification_positive_class \" baseball\" -m ada"
+ "!openai api fine_tunes.create -t \"sport2_prepared_train.jsonl\" -v \"sport2_prepared_valid.jsonl\" --compute_classification_metrics --classification_positive_class \" baseball\" -m ada"
],
"outputs": [
{
@@ -737,4 +737,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
-}
\ No newline at end of file
+}
examples/finetuning/olympics-3-train-qa.ipynb
@@ -373,7 +373,7 @@
}
],
"source": [
- "!openai api fine_tunes.create -t \"olympics-data/discriminator_train.jsonl\" -v \"olympics-data/discriminator_test.jsonl\" --no_packing --batch_size 16 --compute_classification_metrics --classification_positive_class \" yes\" --model ada"
+ "!openai api fine_tunes.create -t \"olympics-data/discriminator_train.jsonl\" -v \"olympics-data/discriminator_test.jsonl\" --batch_size 16 --compute_classification_metrics --classification_positive_class \" yes\" --model ada"
]
},
{
@@ -391,7 +391,7 @@
}
],
"source": [
- "!openai api fine_tunes.create -t \"olympics-data/qa_train.jsonl\" -v \"olympics-data/qa_test.jsonl\" --no_packing --batch_size 16"
+ "!openai api fine_tunes.create -t \"olympics-data/qa_train.jsonl\" -v \"olympics-data/qa_test.jsonl\" --batch_size 16"
]
},
{
openai/cli.py
@@ -397,7 +397,6 @@ class FineTune:
"batch_size",
"learning_rate_multiplier",
"prompt_loss_weight",
- "use_packing",
"compute_classification_metrics",
"classification_n_classes",
"classification_positive_class",
@@ -891,23 +890,6 @@ Mutually exclusive with `top_p`.""",
"learning rate is determined by the original learning rate used for "
"pretraining multiplied by this value.",
)
- sub.add_argument(
- "--use_packing",
- action="store_true",
- dest="use_packing",
- help="On classification tasks, we recommend not setting this flag. "
- "On all other tasks, we recommend setting it. "
- "When set, we pack as many prompt-completion pairs as possible into each "
- "training example. This greatly increases the speed of a fine-tuning job, "
- "often without negatively affecting model performance.",
- )
- sub.add_argument(
- "--no_packing",
- action="store_false",
- dest="use_packing",
- help="Disables the packing flag (see --use_packing for description).",
- )
- sub.set_defaults(use_packing=None)
sub.add_argument(
"--prompt_loss_weight",
type=float,
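
For reference, the removed `--use_packing`/`--no_packing` options were a paired `store_true`/`store_false` switch writing into a single tri-state destination. The sketch below reproduces that pattern in isolation (the parser and the demo loop are illustrative, not part of the openai CLI after this commit) to show how the flags mapped onto `use_packing=True/False/None` before they were dropped.

```python
import argparse

# Standalone sketch of the paired-flag pattern removed from openai/cli.py above.
# Both flags write into the same destination; the parser-level default of None
# lets downstream code tell "flag not given" apart from an explicit True/False.
parser = argparse.ArgumentParser(description="paired packing-flag demo")
parser.add_argument(
    "--use_packing",
    action="store_true",
    dest="use_packing",
    help="Pack multiple prompt-completion pairs into each training example.",
)
parser.add_argument(
    "--no_packing",
    action="store_false",
    dest="use_packing",
    help="Disable packing (see --use_packing).",
)
# Parser-level defaults override the store_true/store_false argument defaults.
parser.set_defaults(use_packing=None)

for argv in ([], ["--use_packing"], ["--no_packing"]):
    print(argv, "->", parser.parse_args(argv).use_packing)
# []                -> None
# ['--use_packing'] -> True
# ['--no_packing']  -> False
```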
openai/validators.py
@@ -2,7 +2,6 @@ import os
import sys
from typing import Any, Callable, NamedTuple, Optional
-import numpy as np
import pandas as pd
@@ -535,12 +534,12 @@ def read_any_format(fname, fields=["prompt", "completion"]):
def format_inferrer_validator(df):
"""
This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification.
- It will also suggest to use ada, --no_packing and explain train/validation split benefits.
+ It will also suggest to use ada and explain train/validation split benefits.
"""
ft_type = infer_task_type(df)
immediate_msg = None
if ft_type == "classification":
- immediate_msg = f"\n- Based on your data it seems like you're trying to fine-tune a model for {ft_type}\n- For classification, we recommend you try one of the faster and cheaper models, such as `ada`. You should also set the `--no_packing` parameter when fine-tuning\n- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training"
+ immediate_msg = f"\n- Based on your data it seems like you're trying to fine-tune a model for {ft_type}\n- For classification, we recommend you try one of the faster and cheaper models, such as `ada`\n- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training"
return Remediation(name="num_examples", immediate_msg=immediate_msg)
@@ -634,27 +633,6 @@ def get_classification_hyperparams(df):
return n_classes, pos_class
-def get_batch_size_suggestion(df, no_packing):
- """
- Suggest the batch size based on the number of examples after packing optionally is applied.
- """
- n_examples, n_characters = (
- len(df),
- df.completion.str.len().sum() + df.prompt.str.len().sum(),
- )
- BATCH_SIZE_TO_N_EXAMPLES_RATIO = 0.002
- BATCH_SIZE_TO_N_CHARACTERS_RATIO = BATCH_SIZE_TO_N_EXAMPLES_RATIO / 10_000
-
- if no_packing:
- batch_size = BATCH_SIZE_TO_N_EXAMPLES_RATIO * n_examples
- else:
- batch_size = BATCH_SIZE_TO_N_CHARACTERS_RATIO * n_characters
-
- batch_size = max(1, int(2 ** np.ceil(np.log2(batch_size))))
- batch_size_suggestion = f" --batch_size {batch_size}"
- return batch_size_suggestion
-
-
def write_out_file(df, fname, any_remediations, auto_accept):
"""
This function will write out a dataframe to a file, if the user would like to proceed, and also offer a fine-tuning command with the newly created file.
@@ -670,14 +648,7 @@ def write_out_file(df, fname, any_remediations, auto_accept):
if accept_suggestion(input_text, auto_accept):
split = True
- no_packing = ft_format == "classification" or (
- ft_format == "conditional generation" and len(df) < 1000
- )
additional_params = ""
- if no_packing:
- additional_params = " --no_packing"
- additional_params += get_batch_size_suggestion(df, no_packing)
-
common_prompt_suffix_new_line_handled = common_prompt_suffix.replace("\n", "\\n")
common_completion_suffix_new_line_handled = common_completion_suffix.replace(
"\n", "\\n"
openai/version.py
@@ -1,1 +1,1 @@
-VERSION = "0.11.4"
+VERSION = "0.11.5"