test: retry getting rows after streaming them in test_insert_rows_from_dataframe (#832)

tswast · web-flow · commit d8c25ac139d5 · 2021-07-29T10:59:03.000+02:00
diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py
@@ -21,6 +21,7 @@
 import io
 import operator
 
+import google.api_core.retry
 import pkg_resources
 import pytest
 import pytz
@@ -41,6 +42,10 @@
 PANDAS_INT64_VERSION = pkg_resources.parse_version("1.0.0")
 
 
+class MissingDataError(Exception):  # Row-count mismatch; used as retry signal.
+    pass
+
+
 def test_load_table_from_dataframe_w_automatic_schema(bigquery_client, dataset_id):
     """Test that a DataFrame with dtypes that map well to BigQuery types
     can be uploaded without specifying a schema.
@@ -666,27 +671,34 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
     )
     for errors in chunk_errors:
         assert not errors
-
-    # Use query to fetch rows instead of listing directly from the table so
-    # that we get values from the streaming buffer.
-    rows = list(
-        bigquery_client.query(
-            "SELECT * FROM `{}.{}.{}`".format(
-                table.project, table.dataset_id, table.table_id
-            )
-        )
-    )
-
-    sorted_rows = sorted(rows, key=operator.attrgetter("int_col"))
-    row_tuples = [r.values() for r in sorted_rows]
     expected = [
         # Pandas often represents NULL values as NaN. Convert to None for
         # easier comparison.
         tuple(None if col != col else col for col in data_row)
         for data_row in dataframe.itertuples(index=False)
     ]
 
-    assert len(row_tuples) == len(expected)
+    # Use query to fetch rows instead of listing directly from the table so
+    # that we get values from the streaming buffer "within a few seconds".
+    # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability
+    @google.api_core.retry.Retry(  # Re-invoke get_rows() on MissingDataError.
+        predicate=google.api_core.retry.if_exception_type(MissingDataError)
+    )
+    def get_rows():
+        rows = list(
+            bigquery_client.query(
+                "SELECT * FROM `{}.{}.{}`".format(
+                    table.project, table.dataset_id, table.table_id
+                )
+            )
+        )
+        if len(rows) != len(expected):  # Row count is off, so ask for a retry.
+            raise MissingDataError()
+        return rows  # Same length as `expected`; contents are checked by caller.
+
+    rows = get_rows()
+    sorted_rows = sorted(rows, key=operator.attrgetter("int_col"))
+    row_tuples = [r.values() for r in sorted_rows]
 
     for row, expected_row in zip(row_tuples, expected):
         assert (