Commit d1769c15
Changed files (2)
openai
openai/tests/test_long_examples_validator.py
@@ -0,0 +1,48 @@
+import json
+import subprocess
+from tempfile import NamedTemporaryFile
+
+
def test_long_examples_validator() -> None:
    """
    Ensure that long_examples_validator() handles previously applied
    recommendations (namely dropped duplicates) without raising a KeyError.
    """
    # Long rows exceed the 10000-character prompt+completion threshold used
    # by long_examples_validator().
    short_prompt = "a prompt "
    long_prompt = short_prompt * 500

    short_completion = "a completion "
    long_completion = short_completion * 500

    # The ordering matters: the duplicate long rows surround the short row,
    # so dropping duplicates first re-indexes the remaining long example.
    unprepared_training_data = [
        {"prompt": long_prompt, "completion": long_completion},  # 1 of 2 duplicates
        {"prompt": short_prompt, "completion": short_completion},
        {"prompt": long_prompt, "completion": long_completion},  # 2 of 2 duplicates
    ]

    # FIX: suffix was "jsonl" (missing dot), so the temp file never actually
    # had a .jsonl extension.
    with NamedTemporaryFile(suffix=".jsonl", mode="w") as training_data:
        for prompt_completion_row in unprepared_training_data:
            training_data.write(json.dumps(prompt_completion_row) + "\n")
        training_data.flush()

        # FIX: pass an argument list with shell=False so a temp-file path
        # containing spaces or shell metacharacters cannot break (or inject
        # into) the command line. Also drop text=True: encoding="utf-8"
        # already selects text mode.
        prepared_data_cmd_output = subprocess.run(
            ["openai", "tools", "fine_tunes.prepare_data", "-f", training_data.name],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            input="y\ny\ny\ny\ny",  # apply all recommendations, one at a time
            encoding="utf-8",
        )

    # Validate the data was prepared successfully (nothing on stderr).
    assert prepared_data_cmd_output.stderr == ""
    # Validate get_long_indexes() was re-applied during the optional_fn()
    # call in long_examples_validator(), which emits this notice.
    assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout
    # FIX: removed the trailing `return ...` — pytest test functions must
    # return None (PytestReturnNotNoneWarning, an error in newer pytest).
\ No newline at end of file
openai/validators.py
@@ -158,17 +158,24 @@ def long_examples_validator(df):
ft_type = infer_task_type(df)
if ft_type != "open-ended generation":
- long_examples = df.apply(
- lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1
- )
- long_indexes = df.reset_index().index[long_examples].tolist()
+ def get_long_indexes(d):
+ long_examples = d.apply(
+ lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1
+ )
+ return d.reset_index().index[long_examples].tolist()
+
+ long_indexes = get_long_indexes(df)
if len(long_indexes) > 0:
immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens."
optional_msg = f"Remove {len(long_indexes)} long examples"
def optional_fn(x):
- return x.drop(long_indexes)
+
+ long_indexes_to_drop = get_long_indexes(x)
+ if long_indexes != long_indexes_to_drop:
+ sys.stdout.write(f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}\n")
+ return x.drop(long_indexes_to_drop)
return Remediation(
name="long_examples",