diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
index fba1d41e30..7aa4ed4b5a 100644
--- a/bigframes/session/__init__.py
+++ b/bigframes/session/__init__.py
@@ -1008,10 +1008,12 @@ def _check_file_size(self, filepath: str):
             blob = bucket.blob(blob_name)
             blob.reload()
             file_size = blob.size
-        else:  # local file path
+        elif os.path.exists(filepath):  # local file path
             file_size = os.path.getsize(filepath)
+        else:
+            file_size = None
 
-        if file_size > max_size:
+        if file_size is not None and file_size > max_size:
             # Convert to GB
             file_size = round(file_size / (1024**3), 1)
             max_size = int(max_size / 1024**3)
diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py
index edfd57b965..924fddce12 100644
--- a/bigframes/session/loader.py
+++ b/bigframes/session/loader.py
@@ -18,6 +18,7 @@
 import dataclasses
 import datetime
 import itertools
+import os
 import typing
 from typing import Dict, Hashable, IO, Iterable, List, Optional, Sequence, Tuple, Union
 
@@ -421,11 +422,16 @@ def _read_bigquery_load_job(
                 load_job = self._bqclient.load_table_from_uri(
                     filepath_or_buffer, table, job_config=job_config
                 )
-            else:
+            elif os.path.exists(filepath_or_buffer):  # local file path
                 with open(filepath_or_buffer, "rb") as source_file:
                     load_job = self._bqclient.load_table_from_file(
                         source_file, table, job_config=job_config
                     )
+            else:
+                raise NotImplementedError(
+                    f"BigQuery engine only supports a local file path or GCS path. "
+                    f"{constants.FEEDBACK_LINK}"
+                )
         else:
             load_job = self._bqclient.load_table_from_file(
                 filepath_or_buffer, table, job_config=job_config
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index 5b5db74ea6..ed3e38e6f8 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -1036,6 +1036,25 @@ def test_read_csv_local_w_usecols(session, scalars_pandas_df_index, engine):
     assert len(df.columns) == 1
 
 
+@pytest.mark.parametrize(
+    "engine",
+    [
+        pytest.param(
+            "bigquery",
+            id="bq_engine",
+            marks=pytest.mark.xfail(
+                raises=NotImplementedError,
+            ),
+        ),
+        pytest.param(None, id="default_engine"),
+    ],
+)
+def test_read_csv_others(session, engine):
+    uri = "https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/main/tests/data/people.csv"
+    df = session.read_csv(uri, engine=engine)
+    assert len(df.columns) == 3
+
+
 @pytest.mark.parametrize(
     "engine",
     [
diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py
index 248cf8e0fe..35b2a1982a 100644
--- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py
+++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py
@@ -51,8 +51,7 @@ def read_csv(
         encoding: Optional[str] = None,
         **kwargs,
     ):
-        """Loads DataFrame from comma-separated values (csv) file locally or from
-        Cloud Storage.
+        """Loads data from a comma-separated values (csv) file into a DataFrame.
 
         The CSV file data will be persisted as a temporary BigQuery table, which
         can be automatically recycled after the Session is closed.
@@ -60,7 +59,8 @@ def read_csv(
         .. note::
             using `engine="bigquery"` will not guarantee the same ordering as the
             file. Instead, set a serialized index column as the index and sort by
-            that in the resulting DataFrame.
+            that in the resulting DataFrame. Only files stored on your local machine
+            or in Google Cloud Storage are supported.
 
         .. note::
             For non-bigquery engine, data is inlined in the query SQL if it is
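Taken together, the patch makes engine="bigquery" fail fast on any source that is neither a local file path nor a gs:// URI, while the default engine keeps handling HTTP(S) URLs. Below is a minimal sketch of the resulting behavior, assuming an authenticated bigframes session and using the same public people.csv URL as the new system test; bigframes.pandas is the public entry point, and the exact error message comes from the raise added in loader.py.

import bigframes.pandas as bpd

# Same public CSV the new system test reads (3 columns).
uri = (
    "https://raw.githubusercontent.com/googleapis/"
    "python-bigquery-dataframes/main/tests/data/people.csv"
)

# With engine="bigquery", an HTTP(S) URL is neither a local path nor a
# gs:// URI, so the loader now raises NotImplementedError up front
# instead of an unhelpful file-not-found error from the local-file branch.
try:
    bpd.read_csv(uri, engine="bigquery")
except NotImplementedError as exc:
    print(exc)  # mentions local/GCS support and the feedback link

# The default engine reads the URL client-side via pandas and inlines
# the data, so it still works.
df = bpd.read_csv(uri)
assert len(df.columns) == 3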