Skip to content

Commit 13bd849

Browse files
authored
feat(bigquery): add timeout parameter to QueryJob.done() method (#9875)
* feat(bigquery): add timeout to QueryJob.done() * Add tests for methods that got timeout param In addition, fix the timeout logic in QueryJob.done() - the timeouts are in different units (seconds vs. milliseconds) * Fix lint warning (unused variable) * Adjust timeout exception type in QueryJob.result() * Update dependency pins The new timeout feature requires more recent versions of the API core and google auth dependencies. * Add safety margin on top of server-side timeout If the server-side processing timeout is used (the `timeout_ms` API parameter) as the total timeout, it should be slightly longer than the actual server-side timeout in order to not timeout the connection while there might still be chance that the server-side processing has actually completed.
1 parent 2dabc2d commit 13bd849

6 files changed

Lines changed: 202 additions & 19 deletions

File tree

bigquery/google/cloud/bigquery/client.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,7 +1081,7 @@ def delete_table(self, table, retry=DEFAULT_RETRY, not_found_ok=False):
10811081
raise
10821082

10831083
def _get_query_results(
1084-
self, job_id, retry, project=None, timeout_ms=None, location=None
1084+
self, job_id, retry, project=None, timeout_ms=None, location=None, timeout=None,
10851085
):
10861086
"""Get the query results object for a query job.
10871087
@@ -1096,6 +1096,9 @@ def _get_query_results(
10961096
(Optional) number of milliseconds the API call should
10971097
wait for the query to complete before the request times out.
10981098
location (str): Location of the query job.
1099+
timeout (Optional[float]):
1100+
The number of seconds to wait for the underlying HTTP transport
1101+
before retrying the HTTP request.
10991102
11001103
Returns:
11011104
google.cloud.bigquery.query._QueryResults:
@@ -1122,7 +1125,7 @@ def _get_query_results(
11221125
# job is complete (from QueryJob.done(), called ultimately from
11231126
# QueryJob.result()). So we don't need to poll here.
11241127
resource = self._call_api(
1125-
retry, method="GET", path=path, query_params=extra_params
1128+
retry, method="GET", path=path, query_params=extra_params, timeout=timeout
11261129
)
11271130
return _QueryResults.from_api_repr(resource)
11281131

bigquery/google/cloud/bigquery/job.py

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,14 @@
1414

1515
"""Define API Jobs."""
1616

17+
from __future__ import division
18+
19+
import concurrent.futures
1720
import copy
1821
import re
1922
import threading
2023

24+
import requests
2125
import six
2226
from six.moves import http_client
2327

@@ -50,6 +54,7 @@
5054
_DONE_STATE = "DONE"
5155
_STOPPED_REASON = "stopped"
5256
_TIMEOUT_BUFFER_SECS = 0.1
57+
_SERVER_TIMEOUT_MARGIN_SECS = 1.0
5358
_CONTAINS_ORDER_BY = re.compile(r"ORDER\s+BY", re.IGNORECASE)
5459

5560
_ERROR_REASON_TO_EXCEPTION = {
@@ -663,7 +668,7 @@ def exists(self, client=None, retry=DEFAULT_RETRY):
663668
else:
664669
return True
665670

666-
def reload(self, client=None, retry=DEFAULT_RETRY):
671+
def reload(self, client=None, retry=DEFAULT_RETRY, timeout=None):
667672
"""API call: refresh job properties via a GET request.
668673
669674
See
@@ -675,6 +680,9 @@ def reload(self, client=None, retry=DEFAULT_RETRY):
675680
``client`` stored on the current dataset.
676681
677682
retry (google.api_core.retry.Retry): (Optional) How to retry the RPC.
683+
timeout (Optional[float]):
684+
The number of seconds to wait for the underlying HTTP transport
685+
before retrying the HTTP request.
678686
"""
679687
client = self._require_client(client)
680688

@@ -683,7 +691,11 @@ def reload(self, client=None, retry=DEFAULT_RETRY):
683691
extra_params["location"] = self.location
684692

685693
api_response = client._call_api(
686-
retry, method="GET", path=self.path, query_params=extra_params
694+
retry,
695+
method="GET",
696+
path=self.path,
697+
query_params=extra_params,
698+
timeout=timeout,
687699
)
688700
self._set_properties(api_response)
689701

@@ -2994,9 +3006,16 @@ def estimated_bytes_processed(self):
29943006
result = int(result)
29953007
return result
29963008

2997-
def done(self, retry=DEFAULT_RETRY):
3009+
def done(self, retry=DEFAULT_RETRY, timeout=None):
29983010
"""Refresh the job and checks if it is complete.
29993011
3012+
Args:
3013+
retry (Optional[google.api_core.retry.Retry]):
3014+
How to retry the call that retrieves query results.
3015+
timeout (Optional[float]):
3016+
The number of seconds to wait for the underlying HTTP transport
3017+
before retrying the HTTP request.
3018+
30003019
Returns:
30013020
bool: True if the job is complete, False otherwise.
30023021
"""
@@ -3007,11 +3026,25 @@ def done(self, retry=DEFAULT_RETRY):
30073026
timeout_ms = None
30083027
if self._done_timeout is not None:
30093028
# Subtract a buffer for context switching, network latency, etc.
3010-
timeout = self._done_timeout - _TIMEOUT_BUFFER_SECS
3011-
timeout = max(min(timeout, 10), 0)
3012-
self._done_timeout -= timeout
3029+
api_timeout = self._done_timeout - _TIMEOUT_BUFFER_SECS
3030+
api_timeout = max(min(api_timeout, 10), 0)
3031+
self._done_timeout -= api_timeout
30133032
self._done_timeout = max(0, self._done_timeout)
3014-
timeout_ms = int(timeout * 1000)
3033+
timeout_ms = int(api_timeout * 1000)
3034+
3035+
# If the server-side processing timeout (timeout_ms) is specified and
3036+
# would be picked as the total request timeout, we want to add a small
3037+
# margin to it - we don't want to timeout the connection just as the
3038+
# server-side processing might have completed, but instead slightly
3039+
# after the server-side deadline.
3040+
# However, if `timeout` is specified, and is shorter than the adjusted
3041+
# server timeout, the former prevails.
3042+
if timeout_ms is not None and timeout_ms > 0:
3043+
server_timeout_with_margin = timeout_ms / 1000 + _SERVER_TIMEOUT_MARGIN_SECS
3044+
if timeout is not None:
3045+
timeout = min(server_timeout_with_margin, timeout)
3046+
else:
3047+
timeout = server_timeout_with_margin
30153048

30163049
# Do not refresh if the state is already done, as the job will not
30173050
# change once complete.
@@ -3022,13 +3055,14 @@ def done(self, retry=DEFAULT_RETRY):
30223055
project=self.project,
30233056
timeout_ms=timeout_ms,
30243057
location=self.location,
3058+
timeout=timeout,
30253059
)
30263060

30273061
# Only reload the job once we know the query is complete.
30283062
# This will ensure that fields such as the destination table are
30293063
# correctly populated.
30303064
if self._query_results.complete:
3031-
self.reload(retry=retry)
3065+
self.reload(retry=retry, timeout=timeout)
30323066

30333067
return self.state == _DONE_STATE
30343068

@@ -3132,6 +3166,8 @@ def result(
31323166
exc.message += self._format_for_exception(self.query, self.job_id)
31333167
exc.query_job = self
31343168
raise
3169+
except requests.exceptions.Timeout as exc:
3170+
six.raise_from(concurrent.futures.TimeoutError, exc)
31353171

31363172
# If the query job is complete but there are no query results, this was
31373173
# special job, such as a DDL query. Return an empty result set to

bigquery/setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
release_status = "Development Status :: 5 - Production/Stable"
3131
dependencies = [
3232
'enum34; python_version < "3.4"',
33+
"google-auth >= 1.9.0, < 2.0dev",
34+
"google-api-core >= 1.15.0, < 2.0dev",
3335
"google-cloud-core >= 1.0.3, < 2.0dev",
3436
"google-resumable-media >= 0.3.1, != 0.4.0, < 0.6.0dev",
3537
"protobuf >= 3.6.0",

bigquery/tests/system.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import uuid
2727
import re
2828

29+
import requests
2930
import six
3031
import psutil
3132
import pytest
@@ -1893,6 +1894,29 @@ def test_query_iter(self):
18931894
row_tuples = [r.values() for r in query_job]
18941895
self.assertEqual(row_tuples, [(1,)])
18951896

1897+
def test_querying_data_w_timeout(self):
1898+
job_config = bigquery.QueryJobConfig()
1899+
job_config.use_query_cache = False
1900+
1901+
query_job = Config.CLIENT.query(
1902+
"""
1903+
SELECT name, SUM(number) AS total_people
1904+
FROM `bigquery-public-data.usa_names.usa_1910_current`
1905+
GROUP BY name
1906+
""",
1907+
location="US",
1908+
job_config=job_config,
1909+
)
1910+
1911+
# Specify a very tight deadline to demonstrate that the timeout
1912+
# actually has effect.
1913+
with self.assertRaises(requests.exceptions.Timeout):
1914+
query_job.done(timeout=0.1)
1915+
1916+
# Now wait for the result using a more realistic deadline.
1917+
query_job.result(timeout=30)
1918+
self.assertTrue(query_job.done(timeout=30))
1919+
18961920
@unittest.skipIf(pandas is None, "Requires `pandas`")
18971921
def test_query_results_to_dataframe(self):
18981922
QUERY = """

bigquery/tests/unit/test_client.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,12 +226,14 @@ def test__get_query_results_miss_w_explicit_project_and_timeout(self):
226226
project="other-project",
227227
location=self.LOCATION,
228228
timeout_ms=500,
229+
timeout=42,
229230
)
230231

231232
conn.api_request.assert_called_once_with(
232233
method="GET",
233234
path="/projects/other-project/queries/nothere",
234235
query_params={"maxResults": 0, "timeoutMs": 500, "location": self.LOCATION},
236+
timeout=42,
235237
)
236238

237239
def test__get_query_results_miss_w_client_location(self):
@@ -248,6 +250,7 @@ def test__get_query_results_miss_w_client_location(self):
248250
method="GET",
249251
path="/projects/PROJECT/queries/nothere",
250252
query_params={"maxResults": 0, "location": self.LOCATION},
253+
timeout=None,
251254
)
252255

253256
def test__get_query_results_hit(self):

0 commit comments

Comments
 (0)