Commit f4f6f90c

Rachel Lim <rachel@openai.com>
2021-12-04 08:31:43
[fine-tuning] accept file URLs as train & validation files (#50) tag: v0.11.2
also a few fixes: setting the correct filename for file uploads using files.create, reinstating the progress meter for uploading files in conjunction with the fine-tuning endpoint, standardizing punctuation on FT help strings
1 parent 1f32472
Changed files (3)
openai
openai/api_resources/file.py
@@ -20,6 +20,7 @@ class File(ListableAPIResource, DeletableAPIResource):
         api_base=None,
         api_version=None,
         organization=None,
+        user_provided_filename=None,
     ):
         if purpose != "search" and model is not None:
             raise ValueError("'model' is only meaningful if 'purpose' is 'search'")
@@ -32,9 +33,13 @@ class File(ListableAPIResource, DeletableAPIResource):
         url = cls.class_url()
         # Set the filename on 'purpose' and 'model' to None so they are
         # interpreted as form data.
-        files = [("file", file), ("purpose", (None, purpose))]
+        files = [("purpose", (None, purpose))]
         if model is not None:
             files.append(("model", (None, model)))
+        if user_provided_filename is not None:
+            files.append(("file", (user_provided_filename, file)))
+        else:
+            files.append(("file", file))
         response, _, api_key = requestor.request("post", url, files=files)
         return util.convert_to_openai_object(
             response, api_key, api_version, organization
@@ -65,21 +70,15 @@ class File(ListableAPIResource, DeletableAPIResource):
     @classmethod
     def find_matching_files(
         cls,
+        name,
+        bytes,
+        purpose,
         api_key=None,
         api_base=None,
         api_version=None,
         organization=None,
-        file=None,
-        purpose=None,
     ):
-        if file is None:
-            raise openai.error.InvalidRequestError(
-                "'file' is a required property", "file"
-            )
-        if purpose is None:
-            raise openai.error.InvalidRequestError(
-                "'purpose' is a required property", "purpose"
-            )
+        """Find already uploaded files with the same name, size, and purpose."""
         all_files = cls.list(
             api_key=api_key,
             api_base=api_base or openai.api_base,
@@ -87,15 +86,14 @@ class File(ListableAPIResource, DeletableAPIResource):
             organization=organization,
         ).get("data", [])
         matching_files = []
+        basename = os.path.basename(name)
         for f in all_files:
             if f["purpose"] != purpose:
                 continue
-            if not hasattr(file, "name") or f["filename"] != file.name:
+            file_basename = os.path.basename(f["filename"])
+            if file_basename != basename:
                 continue
-            file.seek(0, os.SEEK_END)
-            if f["bytes"] != file.tell():
-                file.seek(0)
+            if f["bytes"] != bytes:
                 continue
-            file.seek(0)
             matching_files.append(f)
         return matching_files
openai/cli.py
@@ -4,6 +4,8 @@ import signal
 import sys
 import warnings
 
+import requests
+
 import openai
 from openai.upload_progress import BufferReader
 from openai.validators import (
@@ -200,7 +202,10 @@ class File:
         with open(args.file, "rb") as file_reader:
             buffer_reader = BufferReader(file_reader.read(), desc="Upload progress")
         resp = openai.File.create(
-            file=buffer_reader, purpose=args.purpose, model=args.model
+            file=buffer_reader,
+            purpose=args.purpose,
+            model=args.model,
+            user_provided_filename=args.file,
         )
         print(resp)
 
@@ -238,52 +243,102 @@ class FineTune:
         print(resp)
 
     @classmethod
-    def _get_or_upload(cls, file, check_if_file_exists=True):
-        try:
-            openai.File.retrieve(file)
-        except openai.error.InvalidRequestError as e:
-            if e.http_status == 404 and os.path.isfile(file):
-                matching_files = openai.File.find_matching_files(
-                    file=open(file), purpose="fine-tune"
+    def _is_url(cls, file: str):
+        return file.lower().startswith("http")
+
+    @classmethod
+    def _download_file_from_public_url(cls, url: str) -> Optional[bytes]:
+        resp = requests.get(url)
+        if resp.status_code == 200:
+            return resp.content
+        else:
+            return None
+
+    @classmethod
+    def _maybe_upload_file(
+        cls,
+        file: Optional[str] = None,
+        content: Optional[bytes] = None,
+        user_provided_file: Optional[str] = None,
+        check_if_file_exists: bool = True,
+    ):
+        # Exactly one of `file` or `content` must be provided
+        if (file is None) == (content is None):
+            raise ValueError("Exactly one of `file` or `content` must be provided")
+
+        if content is None:
+            assert file is not None
+            with open(file, "rb") as f:
+                content = f.read()
+
+        if check_if_file_exists:
+            bytes = len(content)
+            matching_files = openai.File.find_matching_files(
+                name=user_provided_file or f.name, bytes=bytes, purpose="fine-tune"
+            )
+            if len(matching_files) > 0:
+                file_ids = [f["id"] for f in matching_files]
+                sys.stdout.write(
+                    "Found potentially duplicated files with name '{name}', purpose 'fine-tune' and size {size} bytes\n".format(
+                        name=os.path.basename(matching_files[0]["filename"]),
+                        size=matching_files[0]["bytes"],
+                    )
                 )
-                if len(matching_files) > 0 and check_if_file_exists:
-                    file_ids = [f["id"] for f in matching_files]
+                sys.stdout.write("\n".join(file_ids))
+                while True:
                     sys.stdout.write(
-                        "Found potentially duplicated files with name '{name}', purpose 'fine-tune' and size {size} bytes\n".format(
-                            name=matching_files[0]["filename"],
-                            size=matching_files[0]["bytes"],
-                        )
+                        "\nEnter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: "
                     )
-                    sys.stdout.write("\n".join(file_ids))
-                    while True:
+                    inp = sys.stdin.readline().strip()
+                    if inp in file_ids:
                         sys.stdout.write(
-                            "\nEnter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: "
+                            "Reusing already uploaded file: {id}\n".format(id=inp)
                         )
-                        inp = sys.stdin.readline().strip()
-                        if inp in file_ids:
-                            sys.stdout.write(
-                                "Using your file {file}: {id}\n".format(
-                                    file=file, id=inp
-                                )
-                            )
-                            return inp
-                        elif inp == "":
-                            break
-                        else:
-                            sys.stdout.write(
-                                "File id '{id}' is not among the IDs of the potentially duplicated files\n".format(
-                                    id=inp
-                                )
+                        return inp
+                    elif inp == "":
+                        break
+                    else:
+                        sys.stdout.write(
+                            "File id '{id}' is not among the IDs of the potentially duplicated files\n".format(
+                                id=inp
                             )
+                        )
 
-                resp = openai.File.create(
-                    file=open(file),
-                    purpose="fine-tune",
-                )
-                sys.stdout.write(
-                    "Uploaded file from {file}: {id}\n".format(file=file, id=resp["id"])
+        buffer_reader = BufferReader(content, desc="Upload progress")
+        resp = openai.File.create(
+            file=buffer_reader,
+            purpose="fine-tune",
+            user_provided_filename=user_provided_file or file,
+        )
+        sys.stdout.write(
+            "Uploaded file from {file}: {id}\n".format(
+                file=user_provided_file or file, id=resp["id"]
+            )
+        )
+        return resp["id"]
+
+    @classmethod
+    def _get_or_upload(cls, file, check_if_file_exists=True):
+        try:
+            # 1. If it's a valid file, use it
+            openai.File.retrieve(file)
+            return file
+        except openai.error.InvalidRequestError:
+            pass
+        if os.path.isfile(file):
+            # 2. If it's a file on the filesystem, upload it
+            return cls._maybe_upload_file(
+                file=file, check_if_file_exists=check_if_file_exists
+            )
+        if cls._is_url(file):
+            # 3. If it's a URL, download it temporarily
+            content = cls._download_file_from_public_url(file)
+            if content is not None:
+                return cls._maybe_upload_file(
+                    content=content,
+                    check_if_file_exists=check_if_file_exists,
+                    user_provided_file=file,
                 )
-                return resp["id"]
         return file
 
     @classmethod
@@ -737,15 +792,15 @@ Mutually exclusive with `top_p`.""",
         "--training_file",
         required=True,
         help="JSONL file containing prompt-completion examples for training. This can "
-        "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345) "
-        "or a local file path.",
+        "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345), "
+        'a local file path, or a URL that starts with "http".',
     )
     sub.add_argument(
         "-v",
         "--validation_file",
         help="JSONL file containing prompt-completion examples for validation. This can "
-        "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345) "
-        "or a local file path.",
+        "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345), "
+        'a local file path, or a URL that starts with "http".',
     )
     sub.add_argument(
         "--no_check_if_files_exist",
@@ -780,7 +835,7 @@ Mutually exclusive with `top_p`.""",
         type=float,
         help="The learning rate multiplier to use for training. The fine-tuning "
         "learning rate is determined by the original learning rate used for "
-        "pretraining multiplied by this value",
+        "pretraining multiplied by this value.",
     )
     sub.add_argument(
         "--use_packing",
@@ -796,7 +851,7 @@ Mutually exclusive with `top_p`.""",
         "--no_packing",
         action="store_false",
         dest="use_packing",
-        help="Disables the packing flag (see --use_packing for description)",
+        help="Disables the packing flag (see --use_packing for description).",
     )
     sub.set_defaults(use_packing=None)
     sub.add_argument(
@@ -804,7 +859,7 @@ Mutually exclusive with `top_p`.""",
         type=float,
         help="The weight to use for the prompt loss. The optimum value here depends "
         "depends on your use case. This determines how much the model prioritizes "
-        "learning from prompt tokens vs learning from completion tokens",
+        "learning from prompt tokens vs learning from completion tokens.",
     )
     sub.add_argument(
         "--compute_classification_metrics",
@@ -817,13 +872,13 @@ Mutually exclusive with `top_p`.""",
         "--classification_n_classes",
         type=int,
         help="The number of classes in a classification task. This parameter is "
-        "required for multiclass classification",
+        "required for multiclass classification.",
     )
     sub.add_argument(
         "--classification_positive_class",
         help="The positive class in binary classification. This parameter is needed "
         "to generate precision, recall and F-1 metrics when doing binary "
-        "classification",
+        "classification.",
     )
     sub.add_argument(
         "--classification_betas",
openai/version.py
@@ -1,1 +1,1 @@
-VERSION = "0.11.1"
+VERSION = "0.11.2"