|
22 | 22 | import queue |
23 | 23 | import warnings |
24 | 24 |
|
| 25 | +from packaging import version |
| 26 | + |
| 27 | +from google.cloud.bigquery import _helpers |
| 28 | +from google.cloud.bigquery import schema |
| 29 | + |
25 | 30 | try: |
26 | 31 | import pandas # type: ignore |
27 | 32 |
|
|
43 | 48 | db_dtypes_import_exception = exc |
44 | 49 | date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype |
45 | 50 |
|
46 | | - |
47 | | -import pyarrow # type: ignore |
48 | | -import pyarrow.parquet # type: ignore |
| 51 | +pyarrow = _helpers.PYARROW_VERSIONS.try_import() |
49 | 52 |
|
50 | 53 | try: |
51 | 54 | # _BaseGeometry is used to detect shapely objevys in `bq_to_arrow_array` |
@@ -77,10 +80,6 @@ def _to_wkb(v): |
77 | 80 | # Having BQ Storage available implies that pyarrow >=1.0.0 is available, too. |
78 | 81 | _ARROW_COMPRESSION_SUPPORT = True |
79 | 82 |
|
80 | | -from google.cloud.bigquery import _helpers |
81 | | -from google.cloud.bigquery import schema |
82 | | - |
83 | | - |
84 | 83 | _LOGGER = logging.getLogger(__name__) |
85 | 84 |
|
86 | 85 | _PROGRESS_INTERVAL = 0.2 # Maximum time between download status checks, in seconds. |
@@ -141,52 +140,65 @@ def pyarrow_timestamp(): |
141 | 140 | return pyarrow.timestamp("us", tz="UTC") |
142 | 141 |
|
143 | 142 |
|
144 | | -# This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py |
145 | | -# When modifying it be sure to update it there as well. |
146 | | -BQ_TO_ARROW_SCALARS = { |
147 | | - "BIGNUMERIC": pyarrow_bignumeric, |
148 | | - "BOOL": pyarrow.bool_, |
149 | | - "BOOLEAN": pyarrow.bool_, |
150 | | - "BYTES": pyarrow.binary, |
151 | | - "DATE": pyarrow.date32, |
152 | | - "DATETIME": pyarrow_datetime, |
153 | | - "FLOAT": pyarrow.float64, |
154 | | - "FLOAT64": pyarrow.float64, |
155 | | - "GEOGRAPHY": pyarrow.string, |
156 | | - "INT64": pyarrow.int64, |
157 | | - "INTEGER": pyarrow.int64, |
158 | | - "NUMERIC": pyarrow_numeric, |
159 | | - "STRING": pyarrow.string, |
160 | | - "TIME": pyarrow_time, |
161 | | - "TIMESTAMP": pyarrow_timestamp, |
162 | | -} |
163 | | -ARROW_SCALAR_IDS_TO_BQ = { |
164 | | - # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes |
165 | | - pyarrow.bool_().id: "BOOL", |
166 | | - pyarrow.int8().id: "INT64", |
167 | | - pyarrow.int16().id: "INT64", |
168 | | - pyarrow.int32().id: "INT64", |
169 | | - pyarrow.int64().id: "INT64", |
170 | | - pyarrow.uint8().id: "INT64", |
171 | | - pyarrow.uint16().id: "INT64", |
172 | | - pyarrow.uint32().id: "INT64", |
173 | | - pyarrow.uint64().id: "INT64", |
174 | | - pyarrow.float16().id: "FLOAT64", |
175 | | - pyarrow.float32().id: "FLOAT64", |
176 | | - pyarrow.float64().id: "FLOAT64", |
177 | | - pyarrow.time32("ms").id: "TIME", |
178 | | - pyarrow.time64("ns").id: "TIME", |
179 | | - pyarrow.timestamp("ns").id: "TIMESTAMP", |
180 | | - pyarrow.date32().id: "DATE", |
181 | | - pyarrow.date64().id: "DATETIME", # because millisecond resolution |
182 | | - pyarrow.binary().id: "BYTES", |
183 | | - pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() |
184 | | - # The exact scale and precision don't matter, see below. |
185 | | - pyarrow.decimal128(38, scale=9).id: "NUMERIC", |
186 | | - # The exact decimal's scale and precision are not important, as only |
187 | | - # the type ID matters, and it's the same for all decimal256 instances. |
188 | | - pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", |
189 | | -} |
| 143 | +if pyarrow: |
| 144 | +    # This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py |
| 145 | + # When modifying it be sure to update it there as well. |
| 146 | + BQ_TO_ARROW_SCALARS = { |
| 147 | + "BOOL": pyarrow.bool_, |
| 148 | + "BOOLEAN": pyarrow.bool_, |
| 149 | + "BYTES": pyarrow.binary, |
| 150 | + "DATE": pyarrow.date32, |
| 151 | + "DATETIME": pyarrow_datetime, |
| 152 | + "FLOAT": pyarrow.float64, |
| 153 | + "FLOAT64": pyarrow.float64, |
| 154 | + "GEOGRAPHY": pyarrow.string, |
| 155 | + "INT64": pyarrow.int64, |
| 156 | + "INTEGER": pyarrow.int64, |
| 157 | + "NUMERIC": pyarrow_numeric, |
| 158 | + "STRING": pyarrow.string, |
| 159 | + "TIME": pyarrow_time, |
| 160 | + "TIMESTAMP": pyarrow_timestamp, |
| 161 | + } |
| 162 | + ARROW_SCALAR_IDS_TO_BQ = { |
| 163 | + # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes |
| 164 | + pyarrow.bool_().id: "BOOL", |
| 165 | + pyarrow.int8().id: "INT64", |
| 166 | + pyarrow.int16().id: "INT64", |
| 167 | + pyarrow.int32().id: "INT64", |
| 168 | + pyarrow.int64().id: "INT64", |
| 169 | + pyarrow.uint8().id: "INT64", |
| 170 | + pyarrow.uint16().id: "INT64", |
| 171 | + pyarrow.uint32().id: "INT64", |
| 172 | + pyarrow.uint64().id: "INT64", |
| 173 | + pyarrow.float16().id: "FLOAT64", |
| 174 | + pyarrow.float32().id: "FLOAT64", |
| 175 | + pyarrow.float64().id: "FLOAT64", |
| 176 | + pyarrow.time32("ms").id: "TIME", |
| 177 | + pyarrow.time64("ns").id: "TIME", |
| 178 | + pyarrow.timestamp("ns").id: "TIMESTAMP", |
| 179 | + pyarrow.date32().id: "DATE", |
| 180 | + pyarrow.date64().id: "DATETIME", # because millisecond resolution |
| 181 | + pyarrow.binary().id: "BYTES", |
| 182 | + pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() |
| 183 | + # The exact scale and precision don't matter, see below. |
| 184 | + pyarrow.decimal128(38, scale=9).id: "NUMERIC", |
| 185 | + } |
| 186 | + |
| 187 | + if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): |
| 188 | + BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric |
| 189 | + # The exact decimal's scale and precision are not important, as only |
| 190 | + # the type ID matters, and it's the same for all decimal256 instances. |
| 191 | + ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" |
| 192 | + _BIGNUMERIC_SUPPORT = True |
| 193 | + else: |
| 194 | + _BIGNUMERIC_SUPPORT = False # pragma: NO COVER |
| 195 | + |
| 196 | +else: # pragma: NO COVER |
| 197 | + BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER |
| 198 | +    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO COVER |
| 199 | + _BIGNUMERIC_SUPPORT = False # pragma: NO COVER |
| 200 | + |
| 201 | + |
190 | 202 | BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = { |
191 | 203 | "GEOGRAPHY": { |
192 | 204 | b"ARROW:extension:name": b"google:sqlType:geography", |
@@ -480,6 +492,13 @@ def dataframe_to_bq_schema(dataframe, bq_schema): |
480 | 492 | # If schema detection was not successful for all columns, also try with |
481 | 493 | # pyarrow, if available. |
482 | 494 | if unknown_type_fields: |
| 495 | + if not pyarrow: |
| 496 | + msg = "Could not determine the type of columns: {}".format( |
| 497 | + ", ".join(field.name for field in unknown_type_fields) |
| 498 | + ) |
| 499 | + warnings.warn(msg) |
| 500 | + return None # We cannot detect the schema in full. |
| 501 | + |
483 | 502 | # The augment_schema() helper itself will also issue unknown type |
484 | 503 | # warnings if detection still fails for any of the fields. |
485 | 504 | bq_schema_out = augment_schema(dataframe, bq_schema_out) |
@@ -654,6 +673,8 @@ def dataframe_to_parquet( |
654 | 673 |
|
655 | 674 | This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``. |
656 | 675 | """ |
| 676 | + pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True) |
| 677 | + |
657 | 678 | import pyarrow.parquet # type: ignore |
658 | 679 |
|
659 | 680 | kwargs = ( |
|
0 commit comments