Commit 63490159

joe-at-openai <117690718+joe-at-openai@users.noreply.github.com>
2023-01-22 09:53:02
Improve error message in case of a misformatted file (#158) (#190) tag: v0.26.2
* add more descriptive error handling regarding poorly formatted files * update version * add dot prefix to json file extentions and ensure list of allowable file types is complete * cleanup error messages and add comments to explain jsonl/json loading logic * cleanup csv/tsv reading allowing use of elif for other file extensions, add comments, and remove unnecessary re-attempt to parse as json * run fillna immediately upon DataFrame creation so that an additional switch is not needed * use only 1 try-except block to catch parsing errors + cleanup error message * separate the json and jsonl cases while still maintaining the same functionality, also include a message to user if jsonl appears to be json or vice versa * fix bug in csv path * use index -1 to get extension from split * black formatting apply * fix black Co-authored-by: joe-at-openai <joe@openai.com> Co-authored-by: Boris Power <81998504+BorisPower@users.noreply.github.com>
1 parent 48b6929
Changed files (2)
openai/validators.py
@@ -37,7 +37,7 @@ def necessary_column_validator(df, necessary_column):
     """
 
     def lower_case_column(df, column):
-        cols = [c for c in df.columns if c.lower() == column]
+        cols = [c for c in df.columns if str(c).lower() == column]
         df.rename(columns={cols[0]: column.lower()}, inplace=True)
         return df
 
@@ -47,7 +47,7 @@ def necessary_column_validator(df, necessary_column):
     error_msg = None
 
     if necessary_column not in df.columns:
-        if necessary_column in [c.lower() for c in df.columns]:
+        if necessary_column in [str(c).lower() for c in df.columns]:
 
             def lower_case_column_creator(df):
                 return lower_case_column(df, necessary_column)
@@ -482,51 +482,65 @@ def read_any_format(fname, fields=["prompt", "completion"]):
     df = None
 
     if os.path.isfile(fname):
-        for ending, separator in [(".csv", ","), (".tsv", "\t")]:
-            if fname.lower().endswith(ending):
-                immediate_msg = f"\n- Based on your file extension, your file is formatted as a {ending[1:].upper()} file"
-                necessary_msg = (
-                    f"Your format `{ending[1:].upper()}` will be converted to `JSONL`"
+        try:
+            if fname.lower().endswith(".csv") or fname.lower().endswith(".tsv"):
+                file_extension_str, separator = (
+                    ("CSV", ",") if fname.lower().endswith(".csv") else ("TSV", "\t")
                 )
-                df = pd.read_csv(fname, sep=separator, dtype=str)
-        if fname.lower().endswith(".xlsx"):
-            immediate_msg = "\n- Based on your file extension, your file is formatted as an Excel file"
-            necessary_msg = "Your format `XLSX` will be converted to `JSONL`"
-            xls = pd.ExcelFile(fname)
-            sheets = xls.sheet_names
-            if len(sheets) > 1:
-                immediate_msg += "\n- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet..."
-            df = pd.read_excel(fname, dtype=str)
-        if fname.lower().endswith(".txt"):
-            immediate_msg = "\n- Based on your file extension, you provided a text file"
-            necessary_msg = "Your format `TXT` will be converted to `JSONL`"
-            with open(fname, "r") as f:
-                content = f.read()
-                df = pd.DataFrame(
-                    [["", line] for line in content.split("\n")],
-                    columns=fields,
-                    dtype=str,
+                immediate_msg = f"\n- Based on your file extension, your file is formatted as a {file_extension_str} file"
+                necessary_msg = (
+                    f"Your format `{file_extension_str}` will be converted to `JSONL`"
                 )
-        if fname.lower().endswith("jsonl") or fname.lower().endswith("json"):
-            try:
-                df = pd.read_json(fname, lines=True, dtype=str)
-            except (ValueError, TypeError):
-                df = pd.read_json(fname, dtype=str)
-                immediate_msg = "\n- Your file appears to be in a .JSON format. Your file will be converted to JSONL format"
-                necessary_msg = "Your format `JSON` will be converted to `JSONL`"
-
-        if df is None:
-            error_msg = (
-                "Your file is not saved as a .CSV, .TSV, .XLSX, .TXT or .JSONL file."
-            )
-            if "." in fname:
-                error_msg += (
-                    f" Your file `{fname}` appears to end with `.{fname.split('.')[1]}`"
+                df = pd.read_csv(fname, sep=separator, dtype=str).fillna("")
+            elif fname.lower().endswith(".xlsx"):
+                immediate_msg = "\n- Based on your file extension, your file is formatted as an Excel file"
+                necessary_msg = "Your format `XLSX` will be converted to `JSONL`"
+                xls = pd.ExcelFile(fname)
+                sheets = xls.sheet_names
+                if len(sheets) > 1:
+                    immediate_msg += "\n- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet..."
+                df = pd.read_excel(fname, dtype=str).fillna("")
+            elif fname.lower().endswith(".txt"):
+                immediate_msg = (
+                    "\n- Based on your file extension, you provided a text file"
                 )
+                necessary_msg = "Your format `TXT` will be converted to `JSONL`"
+                with open(fname, "r") as f:
+                    content = f.read()
+                    df = pd.DataFrame(
+                        [["", line] for line in content.split("\n")],
+                        columns=fields,
+                        dtype=str,
+                    ).fillna("")
+            elif fname.lower().endswith(".jsonl"):
+                df = pd.read_json(fname, lines=True, dtype=str).fillna("")
+                if len(df) == 1:
+                    # this is NOT what we expect for a .jsonl file
+                    immediate_msg = "\n- Your JSONL file appears to be in a JSON format. Your file will be converted to JSONL format"
+                    necessary_msg = "Your format `JSON` will be converted to `JSONL`"
+                    df = pd.read_json(fname, dtype=str).fillna("")
+                else:
+                    pass  # this is what we expect for a .jsonl file
+            elif fname.lower().endswith(".json"):
+                df = pd.read_json(fname, lines=True, dtype=str).fillna("")
+                if len(df) == 1:
+                    # this is what we expect for a .json file
+                    df = pd.read_json(fname, dtype=str).fillna("")
+                else:
+                    # this is NOT what we expect for a .json file
+                    immediate_msg = "\n- Your JSON file appears to be in a JSONL format. Your file will be converted to JSONL format"
+                    necessary_msg = "Your format `JSON` will be converted to `JSONL`"
             else:
-                error_msg += f" Your file `{fname}` does not appear to have a file ending. Please ensure your filename ends with one of the supported file endings."
-        else:
-            df.fillna("", inplace=True)
+                error_msg = "Your file must have one of the following extensions: .CSV, .TSV, .XLSX, .TXT, .JSON or .JSONL"
+                if "." in fname:
+                    error_msg += f" Your file `{fname}` ends with the extension `.{fname.split('.')[-1]}` which is not supported."
+                else:
+                    error_msg += f" Your file `{fname}` is missing a file extension."
+
+        except (ValueError, TypeError):
+            file_extension_str = fname.split(".")[-1].upper()
+            error_msg = f"Your file `{fname}` does not appear to be in valid {file_extension_str} format. Please ensure your file is formatted as a valid {file_extension_str} file."
+
     else:
         error_msg = f"File {fname} does not exist."
 
openai/version.py
@@ -1,1 +1,1 @@
-VERSION = "0.26.1"
+VERSION = "0.26.2"
\ No newline at end of file