Commit 7febb755

Boris Power <81998504+BorisPower@users.noreply.github.com>
2021-08-31 23:56:03
Boris/examples and cli (#32) tag: v0.10.3
* Add a codex backtranslation example to improve SQL queries (#58)
* Boris update ft example (#57): update fine-tune example to show the new CLI outputs
* Model specification for search (#60)
* Catch chunked encoding errors and retry (#63)
* Add batch suggestion logic to prepare_data for fine_tunes and custom Q&A answers logic (#62): add an example of how to create a rudimentary answers endpoint with a custom Q&A model
Co-authored-by: Madeleine Thompson <madeleine@openai.com>
Co-authored-by: hallacy <hallacy@openai.com>
1 parent c79fefc
examples/codex/backtranslation.py
@@ -0,0 +1,187 @@
+import openai
+from smokey import Smokey
+from typing import List, Tuple, Union
+
+
+def get_candidates(
+    prompt: str,
+    stop: List[str],
+    temperature: float,
+    priming_prefix: str,
+    engine: str,
+    n: int = 5,
+) -> List[str]:
+    """
+    Generate n candidate completions for the prompt, sampled at the given temperature.
+
+    :param prompt: The prompt to start the conversation with.
+    :param stop: A list of tokens that indicate the end of the generation.
+    :param temperature: The temperature of the generation.
+    :param priming_prefix: The prefix to use for the priming.
+    :param engine: The engine to use for the generation.
+    :param n: The number of completions to generate.
+    :return: A list of completions.
+    """
+    response = openai.Completion.create(
+        engine=engine,
+        prompt=prompt,
+        temperature=temperature,
+        max_tokens=150,
+        top_p=1,
+        frequency_penalty=0,
+        presence_penalty=0,
+        stop=stop,
+        n=n,
+    )
+    responses = [priming_prefix + choice.text for choice in response.choices]
+    return responses
+
+
+def rindex(lst: List, value: str) -> int:
+    """
+    Return the index of the last occurrence of a value in a list.
+
+    :param lst: The list to search in.
+    :param value: The value to search for.
+    :return: The index of the last occurrence of the value.
+    """
+    try:
+        return len(lst) - lst[::-1].index(value) - 1
+    except ValueError:
+        raise ValueError(f"Answer start token `{value}` not found in the eval template")
+
+
+def eval_candidate(
+    candidate_answer: str,
+    original_instruction: str,
+    eval_template: str,
+    answer_start_token: str,
+    engine: str,
+) -> float:
+    """
+    Evaluate a candidate answer by computing the average log probability
+    assigned to the original instruction when the model, guided by the
+    evaluation template, is asked to reconstruct that instruction from the candidate answer.
+
+    :param candidate_answer: The candidate answer to evaluate.
+    :param original_instruction: The original instruction.
+    :param eval_template: The template to use for the evaluation.
+    :param answer_start_token: The token to use to indicate the start of the answer.
+    :param engine: The engine to use for the evaluation.
+    :return: The evaluation of the candidate answer.
+    """
+    response = openai.Completion.create(
+        engine=engine,
+        prompt=eval_template.format(candidate_answer, original_instruction),
+        temperature=0,
+        max_tokens=0,
+        top_p=1,
+        frequency_penalty=0,
+        presence_penalty=0,
+        logprobs=1,
+        echo=True,
+    )
+
+    answer_start = rindex(
+        response["choices"][0]["logprobs"]["tokens"], answer_start_token
+    )
+    logprobs = response["choices"][0]["logprobs"]["token_logprobs"][answer_start + 1 :]
+    return sum(logprobs) / len(logprobs)
+
+
+def backtranslation(
+    prompt_template: str,
+    additional_info: str,
+    instruction: str,
+    eval_template: str,
+    priming_prefix: str = "SELECT",
+    stop1: List[str] = ["#", ";"],
+    answer_start_token: str = "--",
+    n: int = 5,
+    temperature: float = 0.5,
+    return_all_results: bool = False,
+    engine: str = "davinci-codex",
+) -> Union[str, List[Tuple[str, float]]]:
+    """
+    Generate a number of candidate SQL queries for a natural language instruction,
+    and pick the best one based on the average log probability the model assigns
+    to the original instruction when prompted, via the evaluation template, to
+    explain the candidate SQL query in natural language.
+
+    :param prompt_template: The template to use for the prompt to generate SQL.
+    :param additional_info: Additional information to include in the prompt
+                            (SQL Tables, and their properties).
+    :param instruction: The instruction in natural language.
+    :param eval_template: The template to use for the evaluation.
+    :param priming_prefix: The prefix to use for the priming of the SQL query.
+    :param stop1: A list of tokens that indicate the end of the generation.
+    :param answer_start_token: The token to use to indicate the start of the
+                               natural answer.
+    :param n: The number of candidates to generate.
+    :param temperature: The temperature of the generation.
+    :param return_all_results: Whether to return all results or just the best one.
+    :param engine: The engine to use for the generation and evaluation.
+    :return: The best SQL query, or a list of all scored generated SQL queries.
+    """
+    prompt_template = prompt_template.format(
+        additional_info, instruction, priming_prefix
+    )
+
+    candidates = []
+    responses = get_candidates(
+        prompt_template, stop1, temperature, priming_prefix, engine=engine, n=n
+    )
+    for i in range(n):
+        quality = eval_candidate(
+            responses[i],
+            instruction,
+            eval_template,
+            answer_start_token,
+            engine=engine,
+        )
+        candidates.append((responses[i], quality))
+
+    candidates.sort(key=lambda x: x[1], reverse=True)
+    if return_all_results:
+        return candidates
+    return candidates[0][0]
+
+
+def main(
+    nl_query: str = "Return the name of each department that had more than 10 employees in June 2021",
+    eval_template: str = "{};\n-- Explanation of the above query in human readable format\n-- {}",
+    table_definitions: str = "# Employee(id, name, department_id)\n# Department(id, name, address)\n# Salary_Payments(id, employee_id, amount, date)\n",
+    prompt_template: str = "### Postgres SQL tables, with their properties:\n#\n{}#\n### {}\n{}",
+    n: int = 3,
+    temperature: float = 0.3,
+    engine: str = "davinci-codex",
+):
+    """
+    Generate a number of SQL queries given a natural language instruction,
+    and pick the best one based on the highest backtranslation score.
+
+    :param nl_query: The natural language query.
+    :param eval_template: The template to use for the evaluation.
+    :param table_definitions: The definitions of the tables used in the query.
+    :param prompt_template: The template to use for the prompt to generate SQL.
+    :param n: The number of candidates to generate.
+    :param temperature: The temperature of the generation.
+    :param engine: The engine to use for the generation and evaluation.
+    :return: The best SQL query, or a list of all scored generated SQL queries.
+    """
+
+    result = backtranslation(
+        prompt_template,
+        table_definitions,
+        nl_query,
+        eval_template,
+        priming_prefix="SELECT",
+        temperature=temperature,
+        n=n,
+        engine=engine,
+    )
+    print(result)
+
+
+if __name__ == "__main__":
+    Smokey(main)
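
For reference, the backtranslation flow above can also be driven directly from Python rather than through the Smokey CLI wrapper. The sketch below reuses the default templates from main(); it assumes openai.api_key is already configured (for example via the OPENAI_API_KEY environment variable), access to the davinci-codex engine, and that the module is importable as backtranslation. These are assumptions for illustration, not part of this commit.

# Minimal sketch of calling the example directly, using the defaults from main().
from backtranslation import backtranslation  # hypothetical import path for this example file

nl_query = "Return the name of each department that had more than 10 employees in June 2021"
eval_template = "{};\n-- Explanation of the above query in human readable format\n-- {}"
table_definitions = (
    "# Employee(id, name, department_id)\n"
    "# Department(id, name, address)\n"
    "# Salary_Payments(id, employee_id, amount, date)\n"
)
prompt_template = "### Postgres SQL tables, with their properties:\n#\n{}#\n### {}\n{}"

# Generate 3 candidate queries and keep the one whose natural language
# back-translation best matches the original instruction.
best_sql = backtranslation(
    prompt_template,
    table_definitions,
    nl_query,
    eval_template,
    priming_prefix="SELECT",
    temperature=0.3,
    n=3,
    engine="davinci-codex",
)
print(best_sql)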
examples/finetuning/answers-with-ft.py
@@ -0,0 +1,142 @@
+import openai
+import argparse
+
+
+def create_context(
+    question, search_file_id, max_len=1800, search_model="ada", max_rerank=10
+):
+    """
+    Create a context for a question by finding the most similar context from the search file.
+    :param question: The question
+    :param search_file_id: The file id of the search file
+    :param max_len: The maximum length of the returned context (in tokens)
+    :param search_model: The search model to use
+    :param max_rerank: The maximum number of documents to rerank
+    :return: The context
+    """
+    results = openai.Engine(search_model).search(
+        search_model=search_model,
+        query=question,
+        max_rerank=max_rerank,
+        file=search_file_id,
+        return_metadata=True,
+    )
+    returns = []
+    cur_len = 0
+    for result in results["data"]:
+        cur_len += int(result["metadata"]) + 4
+        if cur_len > max_len:
+            break
+        returns.append(result["text"])
+    return "\n\n###\n\n".join(returns)
+
+
+def answer_question(
+    search_file_id="<SEARCH_FILE_ID>",
+    fine_tuned_qa_model="<FT_QA_MODEL_ID>",
+    question="Which country won the European Football championship in 2021?",
+    max_len=1800,
+    search_model="ada",
+    max_rerank=10,
+    debug=False,
+    stop_sequence=["\n", "."],
+    max_tokens=100,
+):
+    """
+    Answer a question based on the most similar context from the search file, using your fine-tuned model.
+    :param question: The question
+    :param fine_tuned_qa_model: The fine tuned QA model
+    :param search_file_id: The file id of the search file
+    :param max_len: The maximum length of the returned context (in tokens)
+    :param search_model: The search model to use
+    :param max_rerank: The maximum number of documents to rerank
+    :param debug: Whether to output debug information
+    :param stop_sequence: The stop sequence for Q&A model
+    :param max_tokens: The maximum number of tokens to return
+    :return: The answer
+    """
+    context = create_context(
+        question,
+        search_file_id,
+        max_len=max_len,
+        search_model=search_model,
+        max_rerank=max_rerank,
+    )
+    if debug:
+        print("Context:\n" + context)
+        print("\n\n")
+    try:
+        response = openai.Completion.create(
+            model=fine_tuned_qa_model,
+            prompt=f"Answer the question based on the context below\n\nText: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
+            temperature=0,
+            max_tokens=max_tokens,
+            top_p=1,
+            frequency_penalty=0,
+            presence_penalty=0,
+            stop=stop_sequence,
+        )
+        return response["choices"][0]["text"]
+    except Exception as e:
+        print(e)
+        return ""
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Rudimentary functionality of the answers endpoint with a fine-tuned Q&A model.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--search_file_id", help="Search file id", required=True, type=str
+    )
+    parser.add_argument(
+        "--fine_tuned_qa_model", help="Fine-tuned QA model id", required=True, type=str
+    )
+    parser.add_argument(
+        "--question", help="Question to answer", required=True, type=str
+    )
+    parser.add_argument(
+        "--max_len",
+        help="Maximum length of the returned context (in tokens)",
+        default=1800,
+        type=int,
+    )
+    parser.add_argument(
+        "--search_model", help="Search model to use", default="ada", type=str
+    )
+    parser.add_argument(
+        "--max_rerank",
+        help="Maximum number of reranking for the search",
+        default=10,
+        type=int,
+    )
+    parser.add_argument(
+        "--debug", help="Print debug information (context used)", action="store_true"
+    )
+    parser.add_argument(
+        "--stop_sequence",
+        help="Stop sequences for the Q&A model",
+        default=["\n", "."],
+        nargs="+",
+        type=str,
+    )
+    parser.add_argument(
+        "--max_tokens",
+        help="Maximum number of tokens to return",
+        default=100,
+        type=int,
+    )
+    args = parser.parse_args()
+    response = answer_question(
+        search_file_id=args.search_file_id,
+        fine_tuned_qa_model=args.fine_tuned_qa_model,
+        question=args.question,
+        max_len=args.max_len,
+        search_model=args.search_model,
+        max_rerank=args.max_rerank,
+        debug=args.debug,
+        stop_sequence=args.stop_sequence,
+        max_tokens=args.max_tokens,
+    )
+    print(f"Answer:{response}")
examples/finetuning/finetuning-classification.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 1,
    "source": [
     "from sklearn.datasets import fetch_20newsgroups\n",
     "import pandas as pd\n",
@@ -33,7 +33,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "source": [
     "print(sports_dataset['data'][0])"
    ],
@@ -75,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "source": [
     "sports_dataset.target_names[sports_dataset['target'][0]]\n"
    ],
@@ -88,14 +88,14 @@
       ]
      },
      "metadata": {},
-     "execution_count": 5
+     "execution_count": 3
     }
    ],
    "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "source": [
     "len_all, len_baseball, len_hockey = len(sports_dataset.data), len([e for e in sports_dataset.target if e == 0]), len([e for e in sports_dataset.target if e == 1])\n",
     "print(f\"Total examples: {len_all}, Baseball examples: {len_baseball}, Hockey examples: {len_hockey}\")"
@@ -128,7 +128,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 5,
    "source": [
     "import pandas as pd\n",
     "\n",
@@ -204,7 +204,7 @@
       ]
      },
      "metadata": {},
-     "execution_count": 10
+     "execution_count": 5
     }
    ],
    "metadata": {}
@@ -218,9 +218,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 6,
    "source": [
-    "df.to_json(\"sport1.jsonl\", orient='records', lines=True)"
+    "df.to_json(\"sport2.jsonl\", orient='records', lines=True)"
    ],
    "outputs": [],
    "metadata": {}
@@ -235,7 +235,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "source": [
     "!pip install --upgrade openai"
    ],
@@ -244,9 +244,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 8,
    "source": [
-    "!openai tools fine_tunes.prepare_data -f sport1.jsonl -q"
+    "!openai tools fine_tunes.prepare_data -f sport2.jsonl -q"
    ],
    "outputs": [
     {
@@ -259,21 +259,28 @@
       "- Based on your data it seems like you're trying to fine-tune a model for classification\n",
       "- For classification, we recommend you try one of the faster and cheaper models, such as `ada`. You should also set the `--no_packing` parameter when fine-tuning\n",
       "- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training\n",
+      "- There are 11 examples that are very long. These are rows: [134, 200, 281, 320, 404, 595, 704, 838, 1113, 1139, 1174]\n",
+      "For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.\n",
       "- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty\n",
       "- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details\n",
       "\n",
       "Based on the analysis we will perform the following actions:\n",
-      "- [Recommended] Add a suffix separator `\\n\\n###\\n\\n` to all prompts [Y/n]: Y- [Recommended] Add a whitespace character to the beginning of the completion [Y/n]: Y- [Recommended] Would you like to split into training and validation set? [Y/n]: Y\n",
+      "- [Recommended] Remove 11 long examples [Y/n]: Y\n",
+      "- [Recommended] Add a suffix separator `\\n\\n###\\n\\n` to all prompts [Y/n]: Y\n",
+      "- [Recommended] Add a whitespace character to the beginning of the completion [Y/n]: Y\n",
+      "- [Recommended] Would you like to split into training and validation set? [Y/n]: Y\n",
+      "\n",
       "\n",
       "Your data will be written to a new JSONL file. Proceed [Y/n]: Y\n",
-      "Wrote modified files to `sport1_prepared_train.jsonl` and `sport1_prepared_valid.jsonl`\n",
+      "\n",
+      "Wrote modified files to `sport2_prepared_train.jsonl` and `sport2_prepared_valid.jsonl`\n",
       "Feel free to take a look!\n",
       "\n",
       "Now use that file when fine-tuning:\n",
-      "> openai api fine_tunes.create -t \"sport1_prepared_train.jsonl\" -v \"sport1_prepared_valid.jsonl\" --no_packing\n",
+      "> openai api fine_tunes.create -t \"sport2_prepared_train.jsonl\" -v \"sport2_prepared_valid.jsonl\" --no_packing --compute_classification_metrics --classification_positive_class \" baseball\"\n",
       "\n",
       "After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `\\n\\n###\\n\\n` for the model to start generating completions, rather than continuing with the prompt.\n",
-      "Once your model starts training, it'll approximately take 31.06 minutes. Queue will approximately take half an hour per job ahead of you.\n"
+      "Once your model starts training, it'll approximately take 30.8 minutes to train a `curie` model, and less for `ada` and `babbage`. Queue will approximately take half an hour per job ahead of you.\n"
      ]
     }
    ],
@@ -294,44 +301,46 @@
    "cell_type": "markdown",
    "source": [
     "## Fine-tuning\n",
-    "The tool suggests we run the following command to train the dataset. We specifically add `-m ada` to fine-tune a cheaper and faster ada model, which is usually comperable in performance to slower and more expensive models on classification use cases. Since this is a classification task, we would like to know what the generalization performance on the provided validation set is for our classification use case. We add `--compute_classification_metrics --classification_positive_class \" hockey\"` in order to compute the classification metrics."
+    "The tool suggests we run the following command to train the dataset. Since this is a classification task, we would like to know what the generalization performance on the provided validation set is for our classification use case. The tool suggests to add `--compute_classification_metrics --classification_positive_class \" baseball\"` in order to compute the classification metrics. Classification performs better with a hyperparameter `--no_packing`.\n",
+    "\n",
+    "We can simply copy the suggested command from the CLI tool. We specifically add `-m ada` to fine-tune a cheaper and faster ada model, which is usually comperable in performance to slower and more expensive models on classification use cases. "
    ],
    "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 9,
    "source": [
-    "!openai api fine_tunes.create -t \"sport1_prepared_train.jsonl\" -v \"sport1_prepared_valid.jsonl\" --no_packing -m ada --compute_classification_metrics --classification_positive_class \" hockey\""
+    "!openai api fine_tunes.create -t \"sport2_prepared_train.jsonl\" -v \"sport2_prepared_valid.jsonl\" --no_packing --compute_classification_metrics --classification_positive_class \" baseball\" -m ada"
    ],
    "outputs": [
     {
      "output_type": "stream",
      "name": "stdout",
      "text": [
-      "Upload progress: 100%|████████████████████| 1.76M/1.76M [00:00<00:00, 1.85Mit/s]\n",
-      "Uploaded file from sport1_prepared_train.jsonl: file-6TJY51ApcI0YzumClqdpyhjk\n",
-      "Upload progress: 100%|███████████████████████| 395k/395k [00:00<00:00, 754kit/s]\n",
-      "Uploaded file from sport1_prepared_valid.jsonl: file-7jmZYAJHneAuzVGlauejsas9\n",
-      "Created fine-tune: ft-T4UkKqMbMM1Eu56q8ks6g8u5\n",
+      "Upload progress: 100%|████████████████████| 1.52M/1.52M [00:00<00:00, 1.81Mit/s]\n",
+      "Uploaded file from sport2_prepared_train.jsonl: file-Dxx2xJqyjcwlhfDHpZdmCXlF\n",
+      "Upload progress: 100%|███████████████████████| 388k/388k [00:00<00:00, 507kit/s]\n",
+      "Uploaded file from sport2_prepared_valid.jsonl: file-Mvb8YAeLnGdneSAFcfiVcgcN\n",
+      "Created fine-tune: ft-2zaA7qi0rxJduWQpdvOvmGn3\n",
       "Streaming events until fine-tuning is complete...\n",
       "\n",
       "(Ctrl-C will interrupt the stream, but not cancel the fine-tune)\n",
-      "[2021-07-26 12:13:52] Created fine-tune: ft-T4UkKqMbMM1Eu56q8ks6g8u5\n",
-      "[2021-07-26 12:13:57] Fine-tune enqueued. Queue number: 0\n",
-      "[2021-07-26 12:14:00] Fine-tune started\n",
-      "[2021-07-26 12:16:56] Completed epoch 1/4\n",
-      "[2021-07-26 12:18:37] Completed epoch 2/4\n",
-      "[2021-07-26 12:20:29] Completed epoch 3/4\n",
-      "[2021-07-26 12:22:31] Completed epoch 4/4\n",
-      "[2021-07-26 12:24:02] Uploaded model: ada:ft-openai-internal-2021-07-26-11-24-00\n",
-      "[2021-07-26 12:24:06] Uploaded result file: file-ForZ3pSAQ6db7bxmMJhw6GEo\n",
-      "[2021-07-26 12:24:07] Fine-tune succeeded\n",
+      "[2021-07-30 13:15:50] Created fine-tune: ft-2zaA7qi0rxJduWQpdvOvmGn3\n",
+      "[2021-07-30 13:15:52] Fine-tune enqueued. Queue number: 0\n",
+      "[2021-07-30 13:15:56] Fine-tune started\n",
+      "[2021-07-30 13:18:55] Completed epoch 1/4\n",
+      "[2021-07-30 13:20:47] Completed epoch 2/4\n",
+      "[2021-07-30 13:22:40] Completed epoch 3/4\n",
+      "[2021-07-30 13:24:31] Completed epoch 4/4\n",
+      "[2021-07-30 13:26:22] Uploaded model: ada:ft-openai-2021-07-30-12-26-20\n",
+      "[2021-07-30 13:26:27] Uploaded result file: file-6Ki9RqLQwkChGsr9CHcr1ncg\n",
+      "[2021-07-30 13:26:28] Fine-tune succeeded\n",
       "\n",
       "Job complete! Status: succeeded 🎉\n",
       "Try out your fine-tuned model:\n",
       "\n",
-      "openai api completions.create -m ada:ft-openai-internal-2021-07-26-11-24-00 -p <YOUR_PROMPT>\n"
+      "openai api completions.create -m ada:ft-openai-2021-07-30-12-26-20 -p <YOUR_PROMPT>\n"
      ]
     }
    ],
@@ -340,7 +349,7 @@
   {
    "cell_type": "markdown",
    "source": [
-    "The model is successfully trained in about ten minutes. We can see the model name is `ada:ft-openai-internal-2021-07-26-11-24-00`, which we can use for doing inference."
+    "The model is successfully trained in about ten minutes. We can see the model name is `ada:ft-openai-2021-07-30-12-26-20`, which we can use for doing inference."
    ],
    "metadata": {}
   },
@@ -354,16 +363,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 10,
    "source": [
-    "!openai api fine_tunes.results -i ft-T4UkKqMbMM1Eu56q8ks6g8u5 > result.csv"
+    "!openai api fine_tunes.results -i ft-2zaA7qi0rxJduWQpdvOvmGn3 > result.csv"
    ],
    "outputs": [],
    "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 11,
    "source": [
     "results = pd.read_csv('result.csv')\n",
     "results[results['classification/accuracy'].notnull()].tail(1)"
@@ -374,19 +383,19 @@
      "data": {
       "text/plain": [
        "     step  elapsed_tokens  elapsed_examples  training_loss  \\\n",
-       "926   927         3108476              3708       0.022579   \n",
+       "929   930         3027688              3720       0.044408   \n",
        "\n",
        "     training_sequence_accuracy  training_token_accuracy  \\\n",
-       "926                         1.0                      1.0   \n",
+       "929                         1.0                      1.0   \n",
        "\n",
        "     classification/accuracy  classification/precision  classification/recall  \\\n",
-       "926                 0.995833                       1.0               0.991667   \n",
+       "929                 0.991597                  0.983471                    1.0   \n",
        "\n",
        "     classification/auroc  classification/auprc  classification/f1.0  \\\n",
-       "926               0.99875              0.998909             0.995816   \n",
+       "929                   1.0                   1.0             0.991667   \n",
        "\n",
        "     validation_loss  validation_sequence_accuracy  validation_token_accuracy  \n",
-       "926              NaN                           NaN                        NaN  "
+       "929              NaN                           NaN                        NaN  "
       ],
       "text/html": [
        "<div>\n",
@@ -426,19 +435,19 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>926</th>\n",
-       "      <td>927</td>\n",
-       "      <td>3108476</td>\n",
-       "      <td>3708</td>\n",
-       "      <td>0.022579</td>\n",
+       "      <th>929</th>\n",
+       "      <td>930</td>\n",
+       "      <td>3027688</td>\n",
+       "      <td>3720</td>\n",
+       "      <td>0.044408</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.991597</td>\n",
+       "      <td>0.983471</td>\n",
        "      <td>1.0</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>0.995833</td>\n",
        "      <td>1.0</td>\n",
        "      <td>0.991667</td>\n",
-       "      <td>0.99875</td>\n",
-       "      <td>0.998909</td>\n",
-       "      <td>0.995816</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -449,7 +458,7 @@
       ]
      },
      "metadata": {},
-     "execution_count": 17
+     "execution_count": 11
     }
    ],
    "metadata": {}
@@ -463,7 +472,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 12,
    "source": [
     "results[results['classification/accuracy'].notnull()]['classification/accuracy'].plot()"
    ],
@@ -476,7 +485,7 @@
       ]
      },
      "metadata": {},
-     "execution_count": 18
+     "execution_count": 12
     },
     {
      "output_type": "display_data",
@@ -484,7 +493,7 @@
       "text/plain": [
        "<Figure size 432x288 with 1 Axes>"
       ],
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAbEUlEQVR4nO3deXCc9Z3n8ffXumzLlmRbsrAOX2DARlxG+AhJYAMshk04E4LBJtlih3+Gmdns7G5BzVZ2lqqtra2amuxMFZsdZjY7Y5mbEOKwDiQBMuxMJNsyh/GNMG5dPuRLkmXr/u4f/dhphGw1dktP99OfV1UX/TzPj+5vP3748Pg5vo+5OyIikvkmhV2AiIikhgJdRCQiFOgiIhGhQBcRiQgFuohIROSG9cWlpaU+f/78sL5eRCQjbd269Yi7l422LLRAnz9/Po2NjWF9vYhIRjKz2LmW6ZCLiEhEKNBFRCJCgS4iEhFjBrqZ/cTMDpvZ9nMsNzP7azNrMrNtZrY09WWKiMhYktlD/3tg1XmW3wksCl6PAz+++LJEROTLGjPQ3f094Nh5htwDrPO4BqDEzOakqkAREUlOKo6hVwItCdOtwTwREZlAE3odupk9TvywDHPnzp3IrxYZ0/Gefna0d7GjvZOevsGwy5EIu3VxOddWl6T8c1MR6G1AdcJ0VTDvC9z9WeBZgNraWjVil9AcOdnH9rbO4NXF9vZOWo+fPrvcLMTiJPJmF01O20DfADxhZi8Cy4FOdz+Qgs8VSYlDXb1sb+vk4zPh3dbJwa7es8sXlBZyXXUJa1fMo6aymKsqiiiZmh9ixSIXZsxAN7MXgFuAUjNrBf4zkAfg7v8L2AjcBTQBp4B/PV7FipyPu9Pe2Zuw593J9vYuOrr7gPhe96Vl01ixcCY1lcXUVBazpKKIosl5IVcukhpjBrq7rx5juQN/mLKKRJLg7rQcO8329jN73p3saO/iWE8/ADmTjEWzp/H1RWVcXVlETWUxi+cUUVgQWvsikXGnrVvS3vCws/9oD9vbuz63993VGz9xmZdjXF4+ndsXl1NTVUxNRRGL5xQxOS8n5MpFJpYCXdLK0LCzr+NkfM+7NX6ycmd7FyeDq07ycyex+JLpfPPaCmoqirm6spjLL5lGQa7CW0SBLqEZGBqm6fDJzx3v3tnexemBIQAm501iyZwi7l9aSU1F/Jj3ovJp5OWoBZHIaBToMiH6B4fZe6j791ebtHex+0AXfYPDABTm53BVRTEPLauO73lXFbOwtJBchbdI0hToknK9A0PsPtidsOfdyZ6D3QwMxW89mF6Qy1WVRTy6ct7Zq00WzCpk0iRd/C1yMRToclFO9Q+y60AX29u6zl5t8snhkwwNx8O7ZGoeNRXFPPbVhdRUFnF1ZTHVM6YqvEXGgQJdktbdO8DO9q7PXW3yacdJguxmVmE+NZXF3La4nJrgUsHKkimYbrsUmRAKdBlV5+kBdgSHSz5u62JHWyf7jvScXV5eVEBNRTF3XT2Hmsr41SblRQUKb5EQKdCFYz39Z491n+lt0nzs1NnllSVTuKqiiPuur4zfGl9ZxOzpk0OsWERGo0DPMoe7e9kR9DP5OLi7su3E75tSzZ05lZrKorNXm9RUFjOzUH1NRDKBAj3ijvX08w+/2392D/xQV9/ZZQtLC1k6bwbf+8o8aiqKuaqimOKp6msikqkU6BH3Zz/7mDd3HOSysml85dLS+GWCFUUsqShiuppSiUSKAj3CDnb28qudh3j8awt56q7FYZcjIuNMt+FF2PObmxl25+HlejqUSDZQoEfUwNAwL2xu5ubLy5g3qzDsckRkAijQI+pXOw7R0d3HoyvnhV2KiEwQBXpEravfT9WMKdx8+eywSxGRCaJAj6C9h7rZ9Nkx1qyYR456pohkDQV6BNXVx8jPncSDtdVhlyIiE0iBHjEn+wZ57f1WvnnNHN3hKZJlFOgR87P3W+npH2LtCp0MFck2CvQIcXfqGmJcXVnMddUlYZcjIhNMgR4hmz47xt5DJ1m7Yp7a2IpkIQV6hNQ1xCiekse3rq0IuxQRCYECPSIOd/Xy1vaDfOeGKqbk54RdjoiEQIEeES9sbmFw2HlEJ0NFspYCPQIGhoZ5fnOMr19exoJS9W0RyVYK9Aj4zc5DHOrq06WKIllOgR4BdQ0xKkum8I0r1bdFJJsp0DNc0+FufvfpUR5ePld9W0SynAI9w61vaCY/ZxLfvVF9W0SynQI9g/X0DfLTra3cdfUllE4rCLscEQlZUoFuZqvMbI+ZNZnZk6Msn2dmb5vZNjP7rZlVpb5UGen1D9vo7htk7cr5YZciImlgzEA3sxzgGeBOYAmw2syWjBj2F8A6d78GeBr4b6kuVD7P3amrj7FkThFL55aEXY6IpIFk9tCXAU3uvs/d+4EXgXtGjFkCvBO8f3eU5ZJijbHj7D7YzdqV6tsiInHJBHol0JIw3RrMS/QRcH/w/j5gupnNGvlBZva4mTWaWWNHR8eF1CuBuvoY0yfncs916tsiInGpOin674GbzewD4GagDRgaOcjdn3X3WnevLSsrS9FXZ5+O7j5+uf0A376hiqn5uWGXIyJpIpk0aAMSr4mrCuad5e7tBHvoZjYNeMDdT6SoRhnhpS3NDAw5a3RnqIgkSGYPfQuwyMwWmFk+8BCwIXGAmZWa2ZnPegr4SWrLlDMGh4Z5blMzX72slEvLpoVdjoikkTED3d0HgSeAt4BdwMvuvsPMnjazu4NhtwB7zGwvUA7813GqN+u9vfswBzp7WbtSe+ci8nlJHYB1943AxhHzfpjw/lXg1dSWJqOpq49RUTyZW9W3RURG0J2iGeTTjpP8U9MRHl4+l9wc/dGJyOcpFTLI+oYYeTnGg+rbIiKjUKBniFP9g7y6tZVVNXOYPX1y2OWISBpSoGeIDR+20907yKM6GSoi56BAzwDuzrr6GFdeMp3aeTPCLkdE0pQCPQO833yCnQe61LdFRM5LgZ4B6ur3M70gl3uvG9lCR0Tk9xToae7IyT42fnyQB26oorBAfVtE5NwU6GnupS0t9A8Ns2bF3LBLEZE0p0BPY0PDzvObmvnKpbO4bPb0sMsRkTSnQE9j7+w+TNuJ06xVV0URSYICPY3VNcQoLyrgtiXlYZciIhlAgZ6m9h/p4b29HTy8bB556tsiIklQUqSp9Q0xcicZq5epb4uIJEeBnoZO9w/xytZW7qi5hNlF6tsiIslRoKehX3zUTufpAZ0MFZEvRYGeZtyddQ37ubx8GssXzAy7HBHJIAr0NPNhywm2t3WxdoX6tojIl6NATzN1DTEK83O4b2lV2KWISIZRoKeRYz39vLHtAPcvrWKa+raIyJekQE8jLze20D84zFo9xEJELoACPU0MDTvrG2IsXzCTy8vVt0VEvjwFepr4x72HaT1+WnvnInLBFOhpoq4+Rtn0Au646pKwSxGRDKVATwPNR0/x270drF42V31bROSCKT3SwHObYkwy4+FleoiFiFw4BXrIegeGeKmxhX+5pJxLitW3RU
QunAI9ZG9sO8CJUwM6GSoiF02BHrK6+v1cWlbIyoWzwi5FRDKcAj1EH7Wc4KPWTvVtEZGUUKCHqK4hxtT8HO6/QX1bROTiJRXoZrbKzPaYWZOZPTnK8rlm9q6ZfWBm28zsrtSXGi3He/r5xUft3Ht9JUWT88IuR0QiYMxAN7Mc4BngTmAJsNrMlowY9p+Al939euAh4H+mutCoeXVrK32Dw3qIhYikTDJ76MuAJnff5+79wIvAPSPGOFAUvC8G2lNXYvQMDzvrN8W4cf4MFs8pGvtfEBFJQjKBXgm0JEy3BvMS/TmwxsxagY3AH432QWb2uJk1mlljR0fHBZQbDe990kHs6CnWrpwfdikiEiGpOim6Gvh7d68C7gLqzOwLn+3uz7p7rbvXlpWVpeirM09dfYzSaQWsUt8WEUmhZAK9DahOmK4K5iV6DHgZwN3rgclAaSoKjJqWY6d4Z89hVi+rJj9XFxmJSOokkyhbgEVmtsDM8omf9NwwYkwzcCuAmS0mHujZe0zlPJ7b1IwBq9W3RURSbMxAd/dB4AngLWAX8atZdpjZ02Z2dzDsT4E/MLOPgBeA77u7j1fRmap3YIiXG1u4fUk5FSVTwi5HRCImqQdXuvtG4ic7E+f9MOH9TuCm1JYWPRs/PsCxnn7WrpgfdikiEkE6iDuB6hpiLCwt5CuXqm+LiKSeAn2CbG/r5IPmE6xZMY9Jk9S3RURST4E+QerqY0zJy+EB9W0RkXGiQJ8AnacG+PlHbdx7fQXFU9S3RUTGhwJ9AryytYXegWHWqG+LiIwjBfo4Gx52ntvUzA3zZnBVRXHY5YhIhCnQx9k/NR3hsyM96qooIuNOgT7O6hpizCrM586r1bdFRMaXAn0ctZ04zdu7DvHdG6spyM0JuxwRiTgF+jh6flMMgIeXq2+LiIw/Bfo46Rsc4sXNLXzjynKqZkwNuxwRyQIK9HHy5vaDHO3pZ+1KnQwVkYmhQB8ndfUx5s+aytcuU1t4EZkYCvRxsLO9i8bYcfVtEZEJpUAfB3UNMSbnTeI7N1SPPVhEJEUU6CnWeXqA1z9o4+5rKyieqr4tIjJxFOgp9tr7rZweGOLRlfPDLkVEsowCPYXcnbqGGNdVl1BTqb4tIjKxFOgp9LtPj7KvQ31bRCQcCvQUWle/nxlT8/hX18wJuxQRyUIK9BQ50HmaX+88xIM3VjM5T31bRGTiKdBT5PlNzTiwZrkOt4hIOBToKdA/OMwLm1v4F1fMpnqm+raISDgU6Cnw1o6DHDnZp74tIhIqBXoK1NXHmDtzKjcvKgu7FBHJYgr0i7T7YBeb9x9jzYq56tsiIqFSoF+kuvoY+bnq2yIi4VOgX4Tu3gF+9kEb37qmghmF+WGXIyJZToF+EV57v41T/UM8qpOhIpIGFOgX6Ezflmuqirm2uiTsckREFOgXqn7fUZoOn1TfFhFJG0kFupmtMrM9ZtZkZk+OsvxHZvZh8NprZidSXmmaWd8Qo2RqHt+6tiLsUkREAMgda4CZ5QDPALcDrcAWM9vg7jvPjHH3HySM/yPg+nGoNW0c6urlrR2HeOyrC9S3RUTSRjJ76MuAJnff5+79wIvAPecZvxp4IRXFpavnNzUz7M4jy+eGXYqIyFnJBHol0JIw3RrM+wIzmwcsAN45x/LHzazRzBo7Ojq+bK1pYWBomBc2N3Pz5WXMm1UYdjkiImel+qToQ8Cr7j402kJ3f9bda929tqwsM2+T/9WOQxzu7tPJUBFJO8kEehuQeBtkVTBvNA8R8cMtdQ37qSyZwi1XzA67FBGRz0km0LcAi8xsgZnlEw/tDSMHmdmVwAygPrUlpo+9h7pp2HeMNSvmkaO+LSKSZsYMdHcfBJ4A3gJ2AS+7+w4ze9rM7k4Y+hDworv7+JQavvUNMfJzJvFgbVXYpYiIfMGYly0CuPtGYOOIeT8cMf3nqSsr/ZzsG+S199v45jVzmDWtIOxyRES+QHeKJulnH7Rxsm+QNerbIiJpSoGeBHdnfX2MmsoirlffFhFJUwr0JGz+7Bh7DnWzdsU8zHQyVETSkwI9CXUNMYom53L3taPeTyUikhYU6GM43NXLm9sP8p3aaqbkq2+LiKQvBfoYXtzSwuCws0Z3hopImlOgn8fg0DDPb2rma4tKWVCqvi0ikt4U6Ofxm12HONjVq74tIpIRFOjnsa4+RmXJFG5dXB52KSIiY1Kgn0PT4W5+9+lRHl4+V31bRCQjKNDPYX1DM3k5xndvrB57sIhIGlCgj6Knb5Cfbm3lrqvnUKq+LSKSIRToo/j5h+109w3yqPq2iEgGUaCP4O6sq9/P4jlFLJ07I+xyRESSpkAfYWvsOLsPdvPoSvVtEZHMokAfYV19jOkFudxzXUXYpYiIfCkK9AQd3X38cvsBHrihiqn5ST37Q0QkbSjQE7y0pZmBIWetToaKSAZSoAfO9G256bJZXFo2LexyRES+NAV64O3dh2nv7GXtivlhlyIickEU6IH1DTHmFE/mtsWzwy5FROSCKNCBfR0n+X+fHOHhZXPJzdEqEZHMpPQioW/LMvVtEZHMlfWBfqp/kFe2trCqZg6zp08OuxwRkQuW9YG+4cN2unsH9RALEcl4WR3o8b4tMa4on86N89W3RUQyW1YH+vvNJ9h5oIu16tsiIhGQ1YG+viHGtIJc7r2+MuxSREQuWtYG+pGTffzfbQd4YGkl0wrUt0VEMl/WBvrLjS30Dw2zRidDRSQisjLQh4ad5xqaWblwFovKp4ddjohISiQV6Ga2ysz2mFmTmT15jjEPmtlOM9thZs+ntszUenf3YdpOnFZXRRGJlDEPHptZDvAMcDvQCmwxsw3uvjNhzCLgKeAmdz9uZmndEKWuIUZ5UQG3LykPuxQRkZRJZg99GdDk7vvcvR94EbhnxJg/AJ5x9+MA7n44tWWmzv4jPfzj3g5WL5tLnvq2iEiEJJNolUBLwnRrMC/R5cDlZvbPZtZgZqtG+yAze9zMGs2ssaOj48IqvkjPbYqRO8lYvWxuKN8vIjJeUrWLmgssAm4BVgN/a2YlIwe5+7PuXuvutWVlZSn66uSd7h/i5cZW7rjqEsqL1LdFRKIlmUBvAxLbEFYF8xK1AhvcfcDdPwP2Eg/4tPKLbe10nh7QpYoiEknJBPoWYJGZLTCzfOAhYMOIMa8T3zvHzEqJH4LZl7oyL567U1cfY9HsaaxYODPsckREUm7MQHf3QeAJ4C1gF/Cyu+8ws6fN7O5g2FvAUTPbCbwL/Ad3PzpeRV+Ij1o7+bitU31bRCSykrrn3d03AhtHzPthwnsH/l3wSkvr6vdTmJ/DferbIiIRlRXX7R3r6eeNbQe4b2kl0yfnhV2OiMi4yIpAf6Wxhf7BYdaumB92KSIi4ybygT407KzfFGPZgplccYn6tohIdEU+0N/b20HLsdN6xJyIRF7kA31d/X7Kphdwx1WXhF2KiMi4inSgNx89xW/3drD6xmrycyP9U0VEoh3oz22KMcmM1cvVt0VEoi+ygd47MMRLjS3cvricOcVTwi5HRGTcRTbQ39h2gBOnB
nhUD7EQkSwR2UCva4hxaVkhKy+dFXYpIiITIpKBvq31BB+1nGDtCvVtEZHsEclAr6uPMSUvh/tvqAq7FBGRCRO5QD9xqp8NH7Vz7/WVFKlvi4hkkcgF+iuNrfQNDuvOUBHJOpEK9OGgb0vtvBksqSgKuxwRkQkVqUB/75MOYkdPsVaXKopIFopUoK9viFE6LZ9VNerbIiLZJzKB3nLsFG/vPsxDN86lIDcn7HJERCZcZAL9+c3NGKhvi4hkrUgEeu/AEC9taeG2xeVUlqhvi4hkp0gE+i+3H+BYT79OhopIVotEoNfVx1hYWshNl5aGXYqISGgyPtC3t3XyfvMJHlkxj0mT1LdFRLJXxgf6+oYYk/Mm8e2l6tsiItktowO989QAr3/Yxr3XVVI8VX1bRCS7ZXSgv/p+K70Dw6xR3xYRkcwN9OFhZ31DjKVzS6ipLA67HBGR0GVsoP/zp0f47EiPLlUUEQlkbKCvq48xszCfu66eE3YpIiJpISMDve3Ead7edYjv3litvi0iIoGMDPTnN8Vw4BH1bREROSupQDezVWa2x8yazOzJUZZ/38w6zOzD4PVvUl9qXN9gvG/LrVfOpmrG1PH6GhGRjJM71gAzywGeAW4HWoEtZrbB3XeOGPqSuz8xDjV+zpvbD3LkZL8uVRQRGSGZPfRlQJO773P3fuBF4J7xLevcCvNzuX1JOV9fVBZWCSIiaWnMPXSgEmhJmG4Flo8y7gEz+zqwF/iBu7eMMuai3baknNuWlI/HR4uIZLRUnRT9BTDf3a8Bfg38w2iDzOxxM2s0s8aOjo4UfbWIiEBygd4GVCdMVwXzznL3o+7eF0z+HXDDaB/k7s+6e62715aV6ZCJiEgqJRPoW4BFZrbAzPKBh4ANiQPMLPHunruBXakrUUREkjHmMXR3HzSzJ4C3gBzgJ+6+w8yeBhrdfQPwx2Z2NzAIHAO+P441i4jIKMzdQ/ni2tpab2xsDOW7RUQylZltdffa0ZZl5J2iIiLyRQp0EZGIUKCLiEREaMfQzawDiF3gv14KHElhOZlK6yFO6yFO6yE71sE8dx/1uu/QAv1imFnjuU4KZBOthzithzitB60DHXIREYkIBbqISERkaqA/G3YBaULrIU7rIU7rIcvXQUYeQxcRkS/K1D10EREZQYEuIhIRGRfoYz3fNCrMrNrM3jWznWa2w8z+JJg/08x+bWafBP+cEcw3M/vrYL1sM7Ol4f6C1DKzHDP7wMzeCKYXmNmm4Pe+FHQCxcwKgummYPn8UAtPITMrMbNXzWy3me0ys5XZtj2Y2Q+C/x62m9kLZjY5G7eFc8moQE94vumdwBJgtZktCbeqcTMI/Km7LwFWAH8Y/NYngbfdfRHwdjAN8XWyKHg9Dvx44kseV3/C59sy/3fgR+5+GXAceCyY/xhwPJj/o2BcVPwV8Ka7XwlcS3x9ZM32YGaVwB8Dte5eQ7z760Nk57YwOnfPmBewEngrYfop4Kmw65qg3/5z4g/q3gPMCebNAfYE7/8GWJ0w/uy4TH8Rf6jK28A3gDcAI343YO7I7YJ4m+eVwfvcYJyF/RtSsA6Kgc9G/pZs2h74/eMwZwZ/tm8Ad2TbtnC+V0btoTP6800rQ6plwgR/Vbwe2ASUu/uBYNFB4MwDVqO8bv4H8B+B4WB6FnDC3QeD6cTfenY9BMs7g/GZbgHQAfyf4NDT35lZIVm0Pbh7G/AXQDNwgPif7Vayb1s4p0wL9KxjZtOAnwL/1t27Epd5fNcj0tedmtk3gcPuvjXsWkKWCywFfuzu1wM9/P7wChD97SE4P3AP8f+5VQCFwKpQi0ozmRboYz7fNErMLI94mD/n7q8Fsw+deeRf8M/DwfyorpubgLvNbD/wIvHDLn8FlJjZmSduJf7Ws+shWF4MHJ3IgsdJK9Dq7puC6VeJB3w2bQ+3AZ+5e4e7DwCvEd8+sm1bOKdMC/Qxn28aFWZmwP8Gdrn7XyYs2gB8L3j/PeLH1s/MfzS4umEF0JnwV/GM5e5PuXuVu88n/uf9jrs/ArwLfDsYNnI9nFk/3w7GZ/xeq7sfBFrM7Ipg1q3ATrJre2gGVpjZ1OC/jzPrIKu2hfMK+yD+l30BdwF7gU+BPwu7nnH8nV8l/tfnbcCHwesu4scA3wY+AX4DzAzGG/ErgD4FPiZ+JUDovyPF6+QW4I3g/UJgM9AEvAIUBPMnB9NNwfKFYdedwt9/HdAYbBOvAzOybXsA/guwG9gO1AEF2bgtnOulW/9FRCIi0w65iIjIOSjQRUQiQoEuIhIRCnQRkYhQoIuIRIQCXUQkIhToIiIR8f8BOpHLTjKrpzgAAAAASUVORK5CYII="
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZKUlEQVR4nO3de5BU55nf8e8zdxhguMxwm0GALCSELggYyXKktWVdbAlZQhcEUmUr65R39c9qd+NskpKSlOOoKpVK1Va8u1WKE2XXu/FWYk2DLkYya2RL8trrkuTu4Spu0hgsnR4GZrjDAHPrJ3/MQdseDUwD3XO6T/8+VV30Oeel++kzhx9n3vf0e8zdERGR0lcRdQEiIpIfCnQRkZhQoIuIxIQCXUQkJhToIiIxURXVGzc2NvqCBQuiensRkZLU3t5+2N2bRtsWWaAvWLCAVCoV1duLiJQkM/v4QtvU5SIiEhMKdBGRmBgz0M3se2bWbWYfXGC7mdlfmlmHmW03s+X5L1NERMaSyxn63wL3X2T7A8Ci8PE08N0rL0tERC7VmIHu7j8Hjl6kySrg+z7sPWCqmc3JV4EiIpKbfPShNwNB1nI6XPcZZva0maXMLNXT05OHtxYRkfPGdVDU3V9091Z3b21qGvUyShERuUz5uA69E5iXtdwSrpMikMk433/3Nxzt7Y+6FBEJ3XP9LJbOm5r3181HoG8AnjGzl4DPAyfcvSsPryt58IuOw3z79V0AmEVcjIgAMHNKXTSBbmY/AO4CGs0sDfwnoBrA3f8nsBFYCXQAZ4B/mfcq5bIlkgHTJlbz3r+/h9qqyqjLEZECGjPQ3f2pMbY78Id5q0jy5mhvP2/uOsjv3j5fYS5SBvRN0Rh7dUsnA0PO2lvnjd1YREqeAj2m3J11qYClLQ0snj0l6nJEZBwo0GNqe/oEew6eYo3OzkXKhgI9ptpSAXXVFTy0dG7UpYjIOFGgx9DZ/iFe33qAlTfOYUpdddTliMg4UaDH0MYdXZzqG1R3i0iZUaDHUFsqYMGMiXx+4fSoSxGRcaRAj5n9h3v51f6jPNE6D9NXQ0XKigI9ZhKpgAqD1Staoi5FRMaZAj1GBocyvNye5svXzWTWlLqoyxGRcaZAj5F/+LCH7lN9GgwVKVMK9BhpSwY0Tqrh7sUzoy5FRCKgQI+JnlN9vL2nm8eWt1BdqR+rSDnSv/yYeGVzmsGMs6ZV3S0i5UqBHgPuTlsqYMX8aVwzc1LU5YhIRBToMdD+8TH29fSyVmfnImVNgR4DiVRAfU0lD948J+pSRCRCCvQSd7pvkDe2d/G1m+dSX5uPW8SKSKlSoJe4H20/wJn+Idbcqm+GipQ7BXqJa0sGfK6pnuVXTYu6FBGJmAK9hHV0n2LzJ8dZe6sm4hIRBXpJa0sGVFUYjy1Xd4uIKNBLVv9ghlc2d3LP9TNpnFQbdTkiUgQU6CXq7T3dHOntZ60m4hKRkAK9RCVSAbOm1PLFRU1RlyIiRUKBXoIOnjjHz/Z2s3pFC1WaiEtEQkqDEvTy5jQZhydWqLtFRP6JAr3EZDJOIhXw+YXTWdBYH3U5IlJEFOgl5v39R/n4yBkNhorIZyjQS0wiFTC5tooHbtREXCLy2xToJeTkuQE27uji4VvmMqGmMupyRKTI5BToZna/me01sw4ze3aU7fPN7C0z225mPzMzfXWxADZsPUDfYEbdLSIyqjED3cwqgReAB4AlwFNmtmREsz8Dvu/uNwPPA/8134XKcHfL4tmTuam5IepSRKQI5XKGfhvQ4e773L0feAlYNaLNEuDt8Pk7o2yXK7S76yTb0ydY06qJuERkdLkEejMQZC2nw3XZtgGPhc8fBSab2YyRL2RmT5tZysxSPT09l1Nv2WpLBtRUVvDospG7XkRkWL4GRf8N8CUz2wJ8CegEhkY2cvcX3b3V3VubmvSV9Vz1DQ7x2tZO7rthFtPqa6IuR0SKVC73LOsEskfhWsJ1n3L3A4Rn6GY2CXjc3Y/nqcay9+bOQxw/M6CbQIvIReVyhp4EFpnZQjOrAZ4ENmQ3MLNGMzv/Ws8B38tvmeUtkQponjqBO69pjLoUESliYwa6uw8CzwCbgN1Awt13mtnzZvZw2OwuYK+ZfQjMAv5LgeotO+ljZ/jHjsOsXtFCRYUGQ0XkwnK6Tby7bwQ2jlj3razn64H1+S1NANa3pwF4olWX9ovIxembokUsk3HWpdLc8blGWqZNjLocESlyCvQi9stfH6bz+FnW6JuhIpIDBXoRa0sGNEyo5itLZkVdioiUAAV6kTrW28+bOw/x6LJm6qo1EZeIjE2BXqR+uLWT/qEMa3TtuYjkSIFehNydtlSam5obWDJ3StTliEiJUKAXoQ86T7K766QGQ0XkkijQi1Bb6hNqqyp4eOncqEsRkRKiQC8y5waG+OHWAzxw42waJlRHXY6IlBAFepH5+w+6OHVuUN0tInLJFOhFpi0ZcNX0idy+8DPTyYuIXJQCvYh8fKSX9/YdZU2rJuISkUunQC8i61JpKgxWr1B3i4hcOgV6kRjKOOvb03zp2iZmN9RFXY6IlCAFepH4+Yc9HDx5Tt8MFZHLpkAvEm3JgBn1NdxzvSbiEpHLo0AvAodP9/HT3cMTcdVU6UciIpdH6VEEXt3cyWDGWatrz0XkCijQI+buJFIBy66ayqJZk6MuR0RKmAI9YluC43zUfZq1GgwVkSukQI9YIhkwobqSr2kiLhG5Qgr0CPX2DfL6tgM8ePMcJtVWRV2OiJQ4BXqEfrSji97+IQ2GikheKNAjlEgGXN1UT+v8aVGXIiIxoECPSEf3aVIfH2NN6zzMNBGXiFw5BXpE1rUHVFYYjy1vjroUEYkJBXoEBoYyvNzeyd2LZzJzsibiEpH8UKBH4J093Rw+3adrz0UkrxToEUikApom13LXdU1RlyIiMaJAH2fdJ8/xzt4eHl/eQlWldr+I5I8SZZyt35xmKOOsaW2JuhQRiRkF+jhyd9al0ty2YDpXN02KuhwRiZmcAt3M7jezvWbWYWbPjrL9KjN7x8y2mNl2M1uZ/1JLX/I3x9h/uJc1+maoiBTAmIFuZpXAC8ADwBLgKTNbMqLZfwQS7r4MeBL4H/kuNA7akgGTaqtYedPsqEsRkRjK5Qz9NqDD3fe5ez/wErBqRBsHpoTPG4AD+SsxHk6dG2Djji4eWjqXiTWaiEtE8i+XQG8GgqzldLgu27eB3zWzNLAR+KPRXsjMnjazlJmlenp6LqPc0vX6ti7ODmgiLhEpnHwNij4F/K27twArgb8zs8+8tru/6O6t7t7a1FRe12C3pQKunTWJpS0NUZciIjGVS6B3AtmnlS3humzfABIA7v4uUAc05qPAONh78BTbguOaiEtECiqXQE8Ci8xsoZnVMDzouWFEm0+AewDM7HqGA728+lQuoi0ZUF1pPLZc156LSOGMGejuPgg8A2wCdjN8NctOM3vezB4Om/0p8Admtg34AfB1d/dCFV1K+gczvLolzX1LZjG9vibqckQkxnK63MLdNzI82J
m97ltZz3cBd+S3tHj46e5DHDszwBpNxCUiBaZvihZYWzJgbkMdv7OovAaBRWT8KdAL6MDxs/z8ox5Wr2ihskKDoSJSWAr0AlrfnsYdVq9Qd4uIFJ4CvUAyGSeRCvhnn5vBVTMmRl2OiJQBBXqBvLvvCOljZ/XNUBEZNwr0AkmkAqbUVfHVGzQRl4iMDwV6AZw4M8Dff3CQR5Y1U1ddGXU5IlImFOgF8MNtnfQPZnTtuYiMKwV6AbQlA26YO4UbmzURl4iMHwV6nn3QeYKdB07q7FxExp0CPc8SqYCaqgoeuWXklPEiIoWlQM+jcwNDvLalk/tvmE3DxOqoyxGRMqNAz6NNOw9y8tygrj0XkUgo0PMokQqYN30CX7h6RtSliEgZUqDnSXD0DL/sOMITK+ZRoYm4RCQCCvQ8WZcKMIPVK3RXIhGJhgI9D4Yyzrr2NL+zqIm5UydEXY6IlCkFeh784qMeuk6cY62uPReRCCnQ8yCRCpg2sZp7l8yMuhQRKWMK9Ct0tLefn+w6xKPLWqit0kRcIhIdBfoVenVLJwNDrmvPRSRyCvQr4O4kkgFL503lutmToy5HRMqcAv0KbEufYO+hUxoMFZGioEC/Am3JgLrqCh5aOifqUkREFOiX60z/IK9vO8DKm+YwuU4TcYlI9BTol2njjoOc7htUd4uIFA0F+mVKpAIWNtZz28LpUZciIgIo0C/L/sO9/Gr/UZ5obcFME3GJSHFQoF+GRCqgssJYvVwTcYlI8VCgX6LBoQwvt6f58nVNzJxSF3U5IiKfUqBfop/t7aH7VJ9uAi0iRSenQDez+81sr5l1mNmzo2z/jpltDR8fmtnxvFdaJNpSAY2TavnyYk3EJSLFpWqsBmZWCbwA3AekgaSZbXD3XefbuPs3s9r/EbCsALVGrvvUOd7e083v37mQ6kr9ciMixSWXVLoN6HD3fe7eD7wErLpI+6eAH+SjuGLz6uZOhjLOE+puEZEilEugNwNB1nI6XPcZZjYfWAi8feWlFRd3py0V0Dp/GtfMnBR1OSIin5HvfoMngfXuPjTaRjN72sxSZpbq6enJ81sXVvvHx9jX08saTZMrIkUql0DvBLJTrCVcN5onuUh3i7u/6O6t7t7a1NSUe5VFoC0ZUF9TyYM3aSIuESlOuQR6ElhkZgvNrIbh0N4wspGZLQamAe/mt8Tone4b5Ec7unho6Vzqa8ccRxYRicSYge7ug8AzwCZgN5Bw951m9ryZPZzV9EngJXf3wpQanTe2HeBM/5AGQ0WkqOV0uunuG4GNI9Z9a8Tyt/NXVnFpSwVcM3MSy6+aGnUpIiIXpIupx/DRoVNs+eQ4a1vnaSIuESlqCvQxJFIBVRXGo8tHvVJTRKRoKNAvon8wwyubO7n3+lk0TqqNuhwRkYtSoF/E23sOcaS3n7W69lxESoAC/SLakgGzp9TxxWtL65p5ESlPCvQLOHjiHP/wYQ+Pr2imskKDoSJS/BToF7C+PSDjaN5zESkZCvRRZDJOIpXm9qunM39GfdTliIjkRIE+ivf3H+WTo2c0GCoiJUWBPopEKmByXRUP3KiJuESkdCjQRzhxdoCNO7pYdctc6qoroy5HRCRnCvQRNmw7QN9ghrWtV0VdiojIJVGgj5BIBiyePZkbm6dEXYqIyCVRoGfZdeAkOzpPsPZWTcQlIqVHgZ4lkQqoqazgkVs0EZeIlB4FeqhvcIjXtnbylRtmMa2+JupyREQumQI99ObOQxw/M6Brz0WkZCnQQ4lUQPPUCdzxucaoSxERuSwKdCB97Az/2HGYJ1pbqNBEXCJSohTowLpUGoDVK1oirkRE5PKVfaAPZZz17WnuvKaRlmkToy5HROSylX2g/7LjMJ3Hz2qaXBEpeWUf6IlUwNSJ1XzlhllRlyIickXKOtCP9fbz5s5DPHJLM7VVmohLREpbWQf6a1s76R/K6NpzEYmFsg10d6ctGXBzSwPXz9FEXCJS+so20Hd0nmDPwVMaDBWR2CjbQG9LBtRWVfDQ0rlRlyIikhdlGehn+4fYsPUAK2+aQ8OE6qjLERHJi7IM9B/v7OJU36C6W0QkVsoy0NuSAfNnTOT2q6dHXYqISN6UXaB/fKSX9/YdZU2r7kokIvGSU6Cb2f1mttfMOszs2Qu0WWNmu8xsp5n9v/yWmT+JVECFwePLNRGXiMRL1VgNzKwSeAG4D0gDSTPb4O67stosAp4D7nD3Y2Y2s1AFX4nBoQzr29Pcdd1MZjfURV2OiEhe5XKGfhvQ4e773L0feAlYNaLNHwAvuPsxAHfvzm+Z+fHzj3o4dLKPNa06OxeR+Mkl0JuBIGs5Ha7Ldi1wrZn90szeM7P7R3shM3vazFJmlurp6bm8iq9AWzJgRn0Ndy/WRFwiEj/5GhStAhYBdwFPAf/bzKaObOTuL7p7q7u3NjU15emtc3P4dB9v7e7mseXN1FSV3ViwiJSBXJKtE8i+YLslXJctDWxw9wF33w98yHDAF41XN3cymHFNxCUisZVLoCeBRWa20MxqgCeBDSPavMbw2Tlm1shwF8y+/JV5ZdydtlTA8qumcs3MyVGXIyJSEGMGursPAs8Am4DdQMLdd5rZ82b2cNhsE3DEzHYB7wD/1t2PFKroS7X5k+N0dJ/W2bmIxNqYly0CuPtGYOOIdd/Keu7Avw4fRSeRDJhYU8mDN2siLhGJr9iPDvb2DfLG9gM8eNMcJtXm9P+XiEhJin2g/2h7F739Q+puEZHYi32gJ1IBVzfVs2L+tKhLEREpqFgHekf3aVIfH2OtJuISkTIQ60BflwqoqjAe00RcIlIGYhvoA0MZXt6c5u7FM2maXBt1OSIiBRfbQH97TzeHT/drMFREykZsAz2RDJg5uZYvXTu+c8aIiEQlloF+6OQ53tnbzeMrWqiqjOVHFBH5jFim3cub02Qc3QRaRMpK7ALd3VmXSnPbwuksbKyPuhwRkXETu0D/1f6j7D/cy1qdnYtImYldoLelAibXVrHypjlRlyIiMq5iFegnzw2wcUcXD90ylwk1lVGXIyIyrmIV6K9vO8C5gYwGQ0WkLMUq0BPJgOtmTWZpS0PUpYiIjLvYBPqegyfZlj7Bmls1EZeIlKfYBHoimaa60nh0WXPUpYiIRCIWgd43OMSrW9J8ZclsptfXRF2OiEgkYhHoP93VzbEzA6zRRFwiUsZiEehtqYC5DXXceU1j1KWIiESm5AO98/hZfvFRD6tb51FZocFQESlfJR/o61Np3OGJFborkYiUt5IO9EzGWdcecMc1M5g3fWLU5YiIRKqkA/3dfUdIHzurb4aKiFDigd6WDGiYUM1Xb5gddSkiIpEr2UA/cWaAH+88yCO3zKWuWhNxiYiUbKC/trWT/sGMrj0XEQmVbKC3JQNubJ7CDXM1EZeICJRooH/QeYJdXSc1GCoikqUkAz2RCqipqmDVUk3EJSJyXk6Bbmb3m9leM+sws2dH2f51M+sxs63h4/fzX+qwcwNDvLalkwdunE3DxOpCvY2ISMmpGquBmVUCLwD3AWkgaWYb3H3XiKZt7v5MAWr8L
Zt2HuTkuUHdBFpEZIRcztBvAzrcfZ+79wMvAasKW9aF1ddUcd+SWdx+9YyoShARKUpjnqEDzUCQtZwGPj9Ku8fN7IvAh8A33T0Ypc0Vu3fJLO5dMqsQLy0iUtLyNSj6OrDA3W8GfgL8n9EamdnTZpYys1RPT0+e3lpERCC3QO8EsjusW8J1n3L3I+7eFy7+FbBitBdy9xfdvdXdW5uami6nXhERuYBcAj0JLDKzhWZWAzwJbMhuYGZzshYfBnbnr0QREcnFmH3o7j5oZs8Am4BK4HvuvtPMngdS7r4B+GMzexgYBI4CXy9gzSIiMgpz90jeuLW11VOpVCTvLSJSqsys3d1bR9tWkt8UFRGRz1Kgi4jEhAJdRCQmIutDN7Me4OPL/OuNwOE8llOqtB+0D87TfiiffTDf3Ue97juyQL8SZpa60KBAOdF+0D44T/tB+wDU5SIiEhsKdBGRmCjVQH8x6gKKhPaD9sF52g/aB6XZhy4iIp9VqmfoIiIyggJdRCQmSi7Qx7q/aVyY2Twze8fMdpnZTjP7k3D9dDP7iZl9FP45LVxvZvaX4X7ZbmbLo/0E+WNmlWa2xczeCJcXmtn74WdtC2cBxcxqw+WOcPuCSAvPIzObambrzWyPme02sy+U6bHwzfDfwwdm9gMzqyvH4+FCSirQs+5v+gCwBHjKzJZEW1XBDAJ/6u5LgNuBPww/67PAW+6+CHgrXIbhfbIofDwNfHf8Sy6YP+G3p2T+b8B33P0a4BjwjXD9N4Bj4frvhO3i4i+AH7v7YmApw/ujrI4FM2sG/hhodfcbGZ799UnK83gYnbuXzAP4ArApa/k54Lmo6xqnz/5Dhm/UvReYE66bA+wNn/8v4Kms9p+2K+UHwzdUeQu4G3gDMIa/DVg18phgeIrnL4TPq8J2FvVnyMM+aAD2j/wsZXgsnL8d5vTw5/sG8NVyOx4u9iipM3RGv79pc0S1jJvwV8VlwPvALHfvCjcdBM7fYDWu++bPgX8HZMLlGcBxdx8Ml7M/56f7INx+Imxf6hYCPcDfhF1Pf2Vm9ZTZseDuncCfAZ8AXQz/fNspv+Phgkot0MuOmU0CXgb+lbufzN7mw6cesb3u1My+BnS7e3vUtUSsClgOfNfdlwG9/FP3ChD/YwEgHCNYxfB/cHOBeuD+SIsqMqUW6GPe3zROzKya4TD/v+7+Srj60Plb/oV/dofr47hv7gAeNrPfAC8x3O3yF8BUMzt/t63sz/npPgi3NwBHxrPgAkkDaXd/P1xez3DAl9OxAHAvsN/de9x9AHiF4WOk3I6HCyq1QB/z/qZxYWYG/DWw293/e9amDcDvhc9/j+G+9fPr/0V4hcPtwImsX8dLkrs/5+4t7r6A4Z/12+7+z4F3gNVhs5H74Py+WR22L/mzVnc/CARmdl246h5gF2V0LIQ+AW43s4nhv4/z+6GsjoeLiroT/1IfwErgQ+DXwH+Iup4Cfs47Gf4VejuwNXysZLgP8C3gI+CnwPSwvTF8BdCvgR0MXwkQ+efI4/64C3gjfH418CugA1gH1Ibr68LljnD71VHXncfPfwuQCo+H14Bp5XgsAP8Z2AN8APwdUFuOx8OFHvrqv4hITJRal4uIiFyAAl1EJCYU6CIiMaFAFxGJCQW6iEhMKNBFRGJCgS4iEhP/HxPg2XO9XdJVAAAAAElFTkSuQmCC"
      },
      "metadata": {
       "needs_background": "light"
@@ -503,9 +512,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 13,
    "source": [
-    "test = pd.read_json('sport1_prepared_valid.jsonl', lines=True)\n",
+    "test = pd.read_json('sport2_prepared_valid.jsonl', lines=True)\n",
     "test.head()"
    ],
    "outputs": [
@@ -575,16 +584,23 @@
       ]
      },
      "metadata": {},
-     "execution_count": 19
+     "execution_count": 13
     }
    ],
    "metadata": {}
   },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "We need to use the same separator following the prompt which we used during fine-tuning. In this case it is `\\n\\n###\\n\\n`. Since we're concerned with classification, we want the temperature to be as low as possible, and we only require one token completion to determine the prediction of the model."
+   ],
+   "metadata": {}
+  },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 14,
    "source": [
-    "ft_model = 'ada:ft-openai-internal-2021-07-26-11-24-00'\n",
+    "ft_model = 'ada:ft-openai-2021-07-30-12-26-20'\n",
     "res = openai.Completion.create(model=ft_model, prompt=test['prompt'][0] + '\\n\\n###\\n\\n', max_tokens=1, temperature=0)\n",
     "res['choices'][0]['text']\n"
    ],
@@ -597,7 +613,7 @@
       ]
      },
      "metadata": {},
-     "execution_count": 30
+     "execution_count": 14
     }
    ],
    "metadata": {}
@@ -611,7 +627,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 15,
    "source": [
     "res = openai.Completion.create(model=ft_model, prompt=test['prompt'][0] + '\\n\\n###\\n\\n', max_tokens=1, temperature=0, logprobs=2)\n",
     "res['choices'][0]['logprobs']['top_logprobs'][0]"
@@ -621,14 +637,14 @@
      "output_type": "execute_result",
      "data": {
       "text/plain": [
-       "<OpenAIObject at 0x7ff86896c728> JSON: {\n",
-       "  \" baseball\": -6.3311357,\n",
-       "  \" hockey\": -0.0018503045\n",
+       "<OpenAIObject at 0x7fe114e435c8> JSON: {\n",
+       "  \" baseball\": -7.6311407,\n",
+       "  \" hockey\": -0.0006307676\n",
        "}"
       ]
      },
      "metadata": {},
-     "execution_count": 29
+     "execution_count": 15
     }
    ],
    "metadata": {}
@@ -650,7 +666,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 16,
    "source": [
     "sample_hockey_tweet = \"\"\"Thank you to the \n",
     "@Canes\n",
@@ -669,14 +685,14 @@
       ]
      },
      "metadata": {},
-     "execution_count": 28
+     "execution_count": 16
     }
    ],
    "metadata": {}
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 17,
    "source": [
     "sample_baseball_tweet=\"\"\"BREAKING: The Tampa Bay Rays are finalizing a deal to acquire slugger Nelson Cruz from the Minnesota Twins, sources tell ESPN.\"\"\"\n",
     "res = openai.Completion.create(model=ft_model, prompt=sample_baseball_tweet + '\\n\\n###\\n\\n', max_tokens=1, temperature=0, logprobs=2)\n",
@@ -691,17 +707,10 @@
       ]
      },
      "metadata": {},
-     "execution_count": 31
+     "execution_count": 17
     }
    ],
    "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "source": [],
-   "outputs": [],
-   "metadata": {}
   }
  ],
  "metadata": {
openai/cli.py
@@ -201,6 +201,7 @@ class File:
         resp = openai.File.create(
             file=open(args.file),
             purpose=args.purpose,
+            model=args.model,
         )
         print(resp)
 
@@ -669,6 +670,11 @@ Mutually exclusive with `top_p`.""",
         help="Why are you uploading this file? (see https://beta.openai.com/docs/api-reference/ for purposes)",
         required=True,
     )
+    sub.add_argument(
+        "-m",
+        "--model",
+        help="Model for search indexing (e.g. 'ada'). Only meaningful if --purpose is 'search'.",
+    )
     sub.set_defaults(func=File.create)
 
     sub = subparsers.add_parser("files.get")
openai/http_client.py
@@ -5,9 +5,9 @@ import textwrap
 import threading
 import time
 from typing import Any, Dict
+from urllib.parse import urlparse
 
 import requests
-from urllib.parse import urlparse
 
 import openai
 from openai import error, util
@@ -265,7 +265,12 @@ class RequestsClient(HTTPClient):
             err = "%s: %s" % (type(e).__name__, str(e))
         # Retry only timeout and connect errors; similar to urllib3 Retry
         elif isinstance(
-            e, (requests.exceptions.Timeout, requests.exceptions.ConnectionError)
+            e,
+            (
+                requests.exceptions.Timeout,
+                requests.exceptions.ConnectionError,
+                requests.exceptions.ChunkedEncodingError,
+            ),
         ):
             msg = (
                 "Unexpected error communicating with OpenAI.  "
openai/validators.py
@@ -1,6 +1,7 @@
 import os
 import sys
 import pandas as pd
+import numpy as np
 
 from typing import NamedTuple, Optional, Callable, Any
 
@@ -567,7 +568,7 @@ def apply_necessary_remediation(df, remediation):
 def accept_suggestion(input_text, auto_accept):
     sys.stdout.write(input_text)
     if auto_accept:
-        sys.stdout.write("Y")
+        sys.stdout.write("Y\n")
         return True
     return input().lower() != "n"
 
@@ -638,6 +639,26 @@ def get_classification_hyperparams(df):
     return n_classes, pos_class
 
 
+def get_batch_size_suggestion(df, no_packing):
+    """
+    Suggest a batch size, based on the number of examples when packing is disabled, or on the total number of characters when packing is applied.
+    """
+    n_examples, n_characters = (
+        len(df),
+        df.completion.str.len().sum() + df.prompt.str.len().sum(),
+    )
+    BATCH_SIZE_TO_N_EXAMPLES_RATIO = 0.002
+    BATCH_SIZE_TO_N_CHARACTERS_RATIO = BATCH_SIZE_TO_N_EXAMPLES_RATIO / 10_000
+
+    if no_packing:
+        batch_size = BATCH_SIZE_TO_N_EXAMPLES_RATIO * n_examples
+    else:
+        batch_size = BATCH_SIZE_TO_N_CHARACTERS_RATIO * n_characters
+    batch_size = 2 ** int(np.log2(batch_size))
+    batch_size_suggestion = f" --batch_size {batch_size}"
+    return batch_size_suggestion
+
+
 def write_out_file(df, fname, any_remediations, auto_accept):
     """
     This function will write out a dataframe to a file, if the user would like to proceed, and also offer a fine-tuning command with the newly created file.
@@ -653,11 +674,14 @@ def write_out_file(df, fname, any_remediations, auto_accept):
         if accept_suggestion(input_text, auto_accept):
             split = True
 
-    classification_params = ""
-    if ft_format == "classification" or (
+    no_packing = ft_format == "classification" or (
         ft_format == "conditional generation" and len(df) < 1000
-    ):
-        classification_params = " --no_packing"
+    )
+    additional_params = ""
+    if no_packing:
+        additional_params = " --no_packing"
+    additional_params += get_batch_size_suggestion(df, no_packing)
+
     common_prompt_suffix_new_line_handled = common_prompt_suffix.replace("\n", "\\n")
     common_completion_suffix_new_line_handled = common_completion_suffix.replace(
         "\n", "\\n"
@@ -672,7 +696,7 @@ def write_out_file(df, fname, any_remediations, auto_accept):
 
     if not any_remediations:
         sys.stdout.write(
-            f'\nYou can use your file for fine-tuning:\n> openai api fine_tunes.create -t "{fname}"{classification_params}\n\nAfter you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `{common_prompt_suffix_new_line_handled}` for the model to start generating completions, rather than continuing with the prompt.{optional_ending_string}\n'
+            f'\nYou can use your file for fine-tuning:\n> openai api fine_tunes.create -t "{fname}"{additional_params}\n\nAfter you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `{common_prompt_suffix_new_line_handled}` for the model to start generating completions, rather than continuing with the prompt.{optional_ending_string}\n'
         )
         estimate_fine_tuning_time(df)
 
@@ -692,13 +716,11 @@ def write_out_file(df, fname, any_remediations, auto_accept):
             )
 
             n_classes, pos_class = get_classification_hyperparams(df)
-            classification_params += " --compute_classification_metrics"
+            additional_params += " --compute_classification_metrics"
             if n_classes == 2:
-                classification_params += (
-                    f' --classification_positive_class "{pos_class}"'
-                )
+                additional_params += f' --classification_positive_class "{pos_class}"'
             else:
-                classification_params += f" --classification_n_classes {n_classes}"
+                additional_params += f" --classification_n_classes {n_classes}"
         else:
             assert len(fnames) == 1
             df[["prompt", "completion"]].to_json(
@@ -714,7 +736,7 @@ def write_out_file(df, fname, any_remediations, auto_accept):
             else f"After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `{common_prompt_suffix_new_line_handled}` for the model to start generating completions, rather than continuing with the prompt."
         )
         sys.stdout.write(
-            f'\nWrote modified file{files_string}`\nFeel free to take a look!\n\nNow use that file when fine-tuning:\n> openai api fine_tunes.create -t "{fnames[0]}"{valid_string}{classification_params}\n\n{separator_reminder}{optional_ending_string}\n'
+            f'\nWrote modified file{files_string}`\nFeel free to take a look!\n\nNow use that file when fine-tuning:\n> openai api fine_tunes.create -t "{fnames[0]}"{valid_string}{additional_params}\n\n{separator_reminder}{optional_ending_string}\n'
         )
         estimate_fine_tuning_time(df)
     else:
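
For intuition, the batch-size suggestion added above works out as follows on a hypothetical dataset (sizes are made up, not taken from this commit):

import numpy as np

BATCH_SIZE_TO_N_EXAMPLES_RATIO = 0.002

# With packing disabled (e.g. classification), the suggestion scales with
# the number of examples and is rounded down to a power of two.
n_examples = 1000                                          # hypothetical dataset size
batch_size = BATCH_SIZE_TO_N_EXAMPLES_RATIO * n_examples   # 2.0
batch_size = 2 ** int(np.log2(batch_size))                 # -> 2
print(f" --batch_size {batch_size}")                       # " --batch_size 2"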
openai/version.py
@@ -1,1 +1,1 @@
-VERSION = "0.10.2"
+VERSION = "0.10.3"