Commit ede08829
Changed files (8)
openai/api_resources/embedding.py
@@ -1,11 +1,10 @@
import base64
import time
-import numpy as np
from openai import util
-from openai.api_resources.abstract import DeletableAPIResource, ListableAPIResource
from openai.api_resources.abstract.engine_api_resource import EngineAPIResource
+from openai.datalib import numpy as np, assert_has_numpy
from openai.error import TryAgain
@@ -40,6 +39,7 @@ class Embedding(EngineAPIResource):
# If an engine isn't using this optimization, don't do anything
if type(data["embedding"]) == str:
+ assert_has_numpy()
data["embedding"] = np.frombuffer(
base64.b64decode(data["embedding"]), dtype="float32"
).tolist()
openai/tests/test_long_examples_validator.py
@@ -2,9 +2,14 @@ import json
import subprocess
from tempfile import NamedTemporaryFile
+import pytest
+
+from openai.datalib import HAS_PANDAS, HAS_NUMPY, NUMPY_INSTRUCTIONS, PANDAS_INSTRUCTIONS
-def test_long_examples_validator() -> None:
+@pytest.mark.skipif(not HAS_PANDAS, reason=PANDAS_INSTRUCTIONS)
+@pytest.mark.skipif(not HAS_NUMPY, reason=NUMPY_INSTRUCTIONS)
+def test_long_examples_validator() -> None:
"""
Ensures that long_examples_validator() handles previously applied recommendations,
namely dropped duplicates, without resulting in a KeyError.
@@ -43,5 +48,5 @@ def test_long_examples_validator() -> None:
assert prepared_data_cmd_output.stderr == ""
# validate get_long_indexes() applied during optional_fn() call in long_examples_validator()
assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout
-
+
return prepared_data_cmd_output.stdout
openai/datalib.py
@@ -0,0 +1,56 @@
+"""
+This module helps make data libraries like `numpy` and `pandas` optional dependencies.
+
+The libraries add up to 130MB+, which makes it challenging to deploy applications
+using this library in environments with code size constraints, like AWS Lambda.
+
+This module serves as an import proxy and provides a few utilities for dealing with the optionality.
+
+Since the primary use case of this library (talking to the OpenAI API) doesn’t generally require data libraries,
+it’s safe to make them optional. The rare case when data libraries are needed in the client is handled through
+assertions with instructive error messages.
+
+See also `setup.py`.
+
+"""
+try:
+ import numpy
+except ImportError:
+ numpy = None
+
+try:
+ import pandas
+except ImportError:
+ pandas = None
+
+HAS_NUMPY = bool(numpy)
+HAS_PANDAS = bool(pandas)
+
+INSTRUCTIONS = """
+
+OpenAI error:
+
+ missing `{library}`
+
+This feature requires additional dependencies:
+
+ $ pip install openai[datalib]
+
+"""
+
+NUMPY_INSTRUCTIONS = INSTRUCTIONS.format(library="numpy")
+PANDAS_INSTRUCTIONS = INSTRUCTIONS.format(library="pandas")
+
+
+class MissingDependencyError(Exception):
+ pass
+
+
+def assert_has_numpy():
+ if not HAS_NUMPY:
+ raise MissingDependencyError(NUMPY_INSTRUCTIONS)
+
+
+def assert_has_pandas():
+ if not HAS_PANDAS:
+ raise MissingDependencyError(PANDAS_INSTRUCTIONS)
openai/embeddings_utils.py
@@ -2,8 +2,6 @@ import textwrap as tr
from typing import List, Optional
import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
import plotly.express as px
from scipy import spatial
from sklearn.decomposition import PCA
@@ -12,6 +10,8 @@ from sklearn.metrics import average_precision_score, precision_recall_curve
from tenacity import retry, stop_after_attempt, wait_random_exponential
import openai
+from openai.datalib import numpy as np
+from openai.datalib import pandas as pd
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
openai/validators.py
@@ -2,7 +2,7 @@ import os
import sys
from typing import Any, Callable, NamedTuple, Optional
-import pandas as pd
+from openai.datalib import pandas as pd, assert_has_pandas
class Remediation(NamedTuple):
@@ -474,6 +474,7 @@ def read_any_format(fname, fields=["prompt", "completion"]):
- for .xlsx it will read the first sheet
- for .txt it will assume completions and split on newline
"""
+ assert_has_pandas()
remediation = None
necessary_msg = None
immediate_msg = None
openai/wandb_logger.py
@@ -13,10 +13,9 @@ if WANDB_AVAILABLE:
import re
from pathlib import Path
- import numpy as np
- import pandas as pd
-
from openai import File, FineTune
+ from openai.datalib import numpy as np
+ from openai.datalib import pandas as pd
class WandbLogger:
README.md
@@ -25,6 +25,26 @@ Install from source with:
python setup.py install
```
+### Optional dependencies
+
+Install dependencies for [`openai.embeddings_utils`](openai/embeddings_utils.py):
+
+```sh
+pip install openai[embeddings]
+```
+
+Install support for [Weights & Biases](https://wandb.me/openai-docs):
+
+```sh
+pip install openai[wandb]
+```
+
+Data libraries like `numpy` and `pandas` are not installed by default due to their size. They’re needed for some functionality of this library, but generally not for talking to the API. If you encounter a `MissingDependencyError`, install them with:
+
+```sh
+pip install openai[datalib]
+```
+
## Usage
The library needs to be configured with your account's secret key, which is available on the [website](https://beta.openai.com/account/api-keys). Either set it as the `OPENAI_API_KEY` environment variable before using the library:
setup.py
@@ -12,6 +12,15 @@ with open(version_path, "rt") as f:
with open("README.md", "r") as fh:
long_description = fh.read()
+
+DATA_LIBRARIES = [
+ # These libraries are optional because of their size. See `openai/datalib.py`.
+ "numpy",
+ "pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool
+ "pandas-stubs>=1.1.0.11", # Needed for type hints for mypy
+ "openpyxl>=3.0.7", # Needed for CLI fine-tuning data preparation tool xlsx format
+]
+
setup(
name="openai",
description="Python client library for the OpenAI API",
@@ -21,22 +30,23 @@ setup(
install_requires=[
"requests>=2.20", # to get the patch for CVE-2018-18074
"tqdm", # Needed for progress bars
- "pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool
- "pandas-stubs>=1.1.0.11", # Needed for type hints for mypy
- "openpyxl>=3.0.7", # Needed for CLI fine-tuning data preparation tool xlsx format
- "numpy",
'typing_extensions;python_version<"3.8"', # Needed for type hints for mypy
"aiohttp", # Needed for async support
],
extras_require={
"dev": ["black~=21.6b0", "pytest==6.*", "pytest-asyncio", "pytest-mock"],
- "wandb": ["wandb"],
+ "datalib": DATA_LIBRARIES,
+ "wandb": [
+ "wandb",
+ *DATA_LIBRARIES,
+ ],
"embeddings": [
"scikit-learn>=1.0.2", # Needed for embedding utils, versions >= 1.1 require python 3.8
"tenacity>=8.0.1",
"matplotlib",
"sklearn",
"plotly",
+ *DATA_LIBRARIES,
],
},
python_requires=">=3.7.1",