From 90c273008d8a4a3f8d884bc9c38a346be3ffa43a Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 13 Apr 2021 15:32:58 -0700 Subject: [PATCH 01/57] dataflow: update Dockerfile and modularize tests --- .../flex-templates/streaming_beam/Dockerfile | 27 +-- .../flex-templates/streaming_beam/e2e_test.py | 112 ++++++++++ .../flex-templates/streaming_beam/noxfile.py | 171 -------------- .../streaming_beam/noxfile_config.py | 40 ++++ .../streaming_beam/requirements-test.txt | 1 - .../streaming_beam/streaming_beam.py | 121 +++++----- .../streaming_beam/streaming_beam_test.py | 165 -------------- dataflow/requirements-test.txt | 4 + dataflow/testing_utils.py | 210 ++++++++++++++++++ 9 files changed, 449 insertions(+), 402 deletions(-) create mode 100644 dataflow/flex-templates/streaming_beam/e2e_test.py delete mode 100644 dataflow/flex-templates/streaming_beam/noxfile.py create mode 100644 dataflow/flex-templates/streaming_beam/noxfile_config.py delete mode 100644 dataflow/flex-templates/streaming_beam/requirements-test.txt delete mode 100644 dataflow/flex-templates/streaming_beam/streaming_beam_test.py create mode 100644 dataflow/requirements-test.txt create mode 100644 dataflow/testing_utils.py diff --git a/dataflow/flex-templates/streaming_beam/Dockerfile b/dataflow/flex-templates/streaming_beam/Dockerfile index 554720eee96..02f346957af 100644 --- a/dataflow/flex-templates/streaming_beam/Dockerfile +++ b/dataflow/flex-templates/streaming_beam/Dockerfile @@ -14,19 +14,20 @@ FROM gcr.io/dataflow-templates-base/python3-template-launcher-base -ARG WORKDIR=/dataflow/template -RUN mkdir -p ${WORKDIR} -WORKDIR ${WORKDIR} +ENV FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE="/template/requirements.txt" +ENV FLEX_TEMPLATE_PYTHON_PY_FILE="/template/streaming_beam.py" -# Due to a change in the Beam base image in version 2.24, we need to install -# libffi-dev manually as a dependency. For more information: -# https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4891 -RUN apt-get update && apt-get install -y libffi-dev git && rm -rf /var/lib/apt/lists/* +COPY . /template -COPY requirements.txt . -COPY streaming_beam.py . +# We could get rid of installing libffi-dev and git, or we could leave them. +RUN apt-get update \ + && apt-get install -y libffi-dev git \ + && rm -rf /var/lib/apt/lists/* \ + # Upgrade pip and install the requirements. + && pip install --no-cache-dir --upgrade pip \ + && pip install --no-cache-dir -r $FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE \ + # Download the requirements to speed up launching the Dataflow job. + && pip download --no-cache-dir --dest /tmp/dataflow-requirements-cache -r $FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE -ENV FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE="${WORKDIR}/requirements.txt" -ENV FLEX_TEMPLATE_PYTHON_PY_FILE="${WORKDIR}/streaming_beam.py" - -RUN pip install -U -r ./requirements.txt +# Since we already downloaded all the dependencies, there's no need to rebuild everything. +ENV PIP_NO_DEPS=True diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py new file mode 100644 index 00000000000..ded698e5e30 --- /dev/null +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -0,0 +1,112 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import subprocess +import time +import uuid + +from google.cloud import bigquery +import pytest + +from . import testing_utils + + +SUFFIX = uuid.uuid4().hex[0:6] +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] +BUCKET_NAME = f"flex-templates-streaming-beam-{SUFFIX}" +BIGQUERY_DATASET = f"flex_templates_{SUFFIX}" +BIGQUERY_TABLE = "streaming_beam" +TOPIC = f"flex-templates-streaming-beam-{SUFFIX}" +SUBSCRIPTION = TOPIC +IMAGE_NAME = f"gcr.io/{PROJECT}/dataflow/flex-templates/streaming-beam-{SUFFIX}:latest" +TEMPLATE_FILE = "template.json" +REGION = "us-central1" + + +@pytest.fixture(scope="session") +def bucket_name() -> str: + return testing_utils.storage_bucket(BUCKET_NAME) + + +@pytest.fixture(scope="session") +def topic_path() -> str: + return testing_utils.pubsub_topic(PROJECT, TOPIC) + + +@pytest.fixture(scope="session") +def subscription_path(topic_path: str) -> str: + return testing_utils.pubsub_subscription(PROJECT, topic_path, SUBSCRIPTION) + + +@pytest.fixture(scope="session") +def bigquery_dataset() -> str: + return testing_utils.bigquery_dataset(PROJECT, BIGQUERY_DATASET) + + +@pytest.fixture(scope="session") +def publisher(topic_path: str) -> bool: + return testing_utils.pubsub_publisher(topic_path) + + +@pytest.fixture(scope="session") +def template_image() -> str: + return testing_utils.container_image(PROJECT, IMAGE_NAME) + + +@pytest.fixture(scope="session") +def template_path(bucket_name: str, template_image: str) -> str: + return testing_utils.dataflow_flex_template_build( + bucket_name=bucket_name, + template_file=TEMPLATE_FILE, + template_image=template_image, + metadata_file="metadata.json", + ) + + +def test_run_template( + publisher: str, + bucket_name: str, + template_path: str, + dataset: str, + subscription_path: str, +) -> None: + + job_name = f"flex-templates-streaming-beam-{SUFFIX}" + subprocess.call( + [ + "gcloud", + "dataflow", + "flex-template", + "run", + job_name, + f"--template-file-gcs-location={template_path}", + f"--temp_location=gs://{bucket_name}/temp", + f"--parameters=input_subscription={subscription_path}", + f"--parameters=output_table={dataset}.{BIGQUERY_TABLE}", + f"--region={REGION}", + ], + check=True, + ) + + # Wait for 10 minutes, and then cancel the job. + time.sleep(10 * 60) + testing_utils.dataflow_jobs_cancel(PROJECT, job_name) + + # Check for output data in BigQuery. + bigquery_client = bigquery.Client() + query = f"SELECT * FROM {PROJECT}.{BIGQUERY_DATASET}.{BIGQUERY_TABLE}" + query_job = bigquery_client.query(query) + rows = query_job.result() + assert rows.total_rows > 0 + for row in rows: + assert row["score"] == 1 diff --git a/dataflow/flex-templates/streaming_beam/noxfile.py b/dataflow/flex-templates/streaming_beam/noxfile.py deleted file mode 100644 index c917ea77ced..00000000000 --- a/dataflow/flex-templates/streaming_beam/noxfile.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -from pathlib import Path - -import nox - - -# DO NOT EDIT - automatically generated. -# All versions used to tested samples. -ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8", "3.9"] - -# Any default versions that should be ignored. -IGNORED_VERSIONS = ["2.7", "3.8", "3.9"] - -TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) - -# -# Style Checks -# - - -def _determine_local_import_names(start_dir): - """Determines all import names that should be considered "local". - - This is used when running the linter to insure that import order is - properly checked. - """ - file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] - return [ - basename - for basename, extension in file_ext_pairs - if extension == ".py" - or os.path.isdir(os.path.join(start_dir, basename)) - and basename not in ("__pycache__") - ] - - -# Linting with flake8. -# -# We ignore the following rules: -# E203: whitespace before ‘:’ -# E266: too many leading ‘#’ for block comment -# E501: line too long -# I202: Additional newline in a section of imports -# -# We also need to specify the rules which are ignored by default: -# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] -FLAKE8_COMMON_ARGS = [ - "--show-source", - "--builtin=gettext", - "--max-complexity=20", - "--import-order-style=google", - "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", - "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", - "--max-line-length=88", -] - - -@nox.session -def lint(session): - session.install("flake8", "flake8-import-order") - - local_names = _determine_local_import_names(".") - args = FLAKE8_COMMON_ARGS + [ - "--application-import-names", - ",".join(local_names), - ".", - ] - session.run("flake8", *args) - - -# -# Black -# - -@nox.session -def blacken(session): - session.install("black") - python_files = [path for path in os.listdir(".") if path.endswith(".py")] - - session.run("black", *python_files) - - -# -# Sample Tests -# - - -PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] - - -def _session_tests(session, post_install=None): - """Runs py.test for a particular project.""" - if os.path.exists("requirements.txt"): - session.install("-r", "requirements.txt") - - if os.path.exists("requirements-test.txt"): - session.install("-r", "requirements-test.txt") - - if post_install: - post_install(session) - - session.run( - "pytest", - *(PYTEST_COMMON_ARGS + session.posargs), - # Pytest will return 5 when no tests are collected. This can happen - # on travis where slow and flaky tests are excluded. - # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html - success_codes=[0, 5] - ) - - -@nox.session(python=ALL_VERSIONS) -def py(session): - """Runs py.test for a sample using the specified version of Python.""" - if session.python in TESTED_VERSIONS: - _session_tests(session) - else: - print("SKIPPED: {} tests are disabled for this sample.".format(session.python)) - - -# -# Readmegen -# - - -def _get_repo_root(): - """ Returns the root folder of the project. """ - # Get root of this repository. Assume we don't have directories nested deeper than 10 items. - p = Path(os.getcwd()) - for i in range(10): - if p is None: - break - if Path(p / ".git").exists(): - return str(p) - p = p.parent - raise Exception("Unable to detect repository root.") - - -GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) - - -@nox.session -@nox.parametrize("path", GENERATED_READMES) -def readmegen(session, path): - """(Re-)generates the readme for a sample.""" - session.install("jinja2", "pyyaml") - dir_ = os.path.dirname(path) - - if os.path.exists(os.path.join(dir_, "requirements.txt")): - session.install("-r", os.path.join(dir_, "requirements.txt")) - - in_file = os.path.join(dir_, "README.rst.in") - session.run( - "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file - ) diff --git a/dataflow/flex-templates/streaming_beam/noxfile_config.py b/dataflow/flex-templates/streaming_beam/noxfile_config.py new file mode 100644 index 00000000000..b6ba946dcbc --- /dev/null +++ b/dataflow/flex-templates/streaming_beam/noxfile_config.py @@ -0,0 +1,40 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default TEST_CONFIG_OVERRIDE for python repos. + +# You can copy this file into your directory, then it will be inported from +# the noxfile.py. + +# The source of truth: +# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/noxfile_config.py + +TEST_CONFIG_OVERRIDE = { + # You can opt out from the test for specific Python versions. + "ignored_versions": ["2.7"], + # Old samples are opted out of enforcing Python type hints + # All new samples should feature them + "enforce_type_hints": True, + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + "envs": { + "PYTEST_ADDOPTS": "-n=8", # parallelize tests in multiple CPUs + }, +} diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt deleted file mode 100644 index d5bd56fd179..00000000000 --- a/dataflow/flex-templates/streaming_beam/requirements-test.txt +++ /dev/null @@ -1 +0,0 @@ -pytest==6.2.1 diff --git a/dataflow/flex-templates/streaming_beam/streaming_beam.py b/dataflow/flex-templates/streaming_beam/streaming_beam.py index af1321e8e18..ab2ecfc9087 100644 --- a/dataflow/flex-templates/streaming_beam/streaming_beam.py +++ b/dataflow/flex-templates/streaming_beam/streaming_beam.py @@ -24,82 +24,99 @@ import json import logging import time +from typing import Any, Dict, List import apache_beam as beam from apache_beam.options.pipeline_options import PipelineOptions import apache_beam.transforms.window as window # Defines the BigQuery schema for the output table. -SCHEMA = ','.join([ - 'url:STRING', - 'num_reviews:INTEGER', - 'score:FLOAT64', - 'first_date:TIMESTAMP', - 'last_date:TIMESTAMP', -]) - - -def parse_json_message(message): +SCHEMA = ",".join( + [ + "url:STRING", + "num_reviews:INTEGER", + "score:FLOAT64", + "first_date:TIMESTAMP", + "last_date:TIMESTAMP", + ] +) + + +def parse_json_message(message: str) -> Dict[str, Any]: """Parse the input json message and add 'score' & 'processing_time' keys.""" row = json.loads(message) return { - 'url': row['url'], - 'score': 1.0 if row['review'] == 'positive' else 0.0, - 'processing_time': int(time.time()), - } - - -def get_statistics(url_messages): - """Get statistics from the input URL messages.""" - url, messages = url_messages - return { - 'url': url, - 'num_reviews': len(messages), - 'score': sum(msg['score'] for msg in messages) / len(messages), - 'first_date': min(msg['processing_time'] for msg in messages), - 'last_date': max(msg['processing_time'] for msg in messages), + "url": row["url"], + "score": 1.0 if row["review"] == "positive" else 0.0, + "processing_time": int(time.time()), } -def run(args, input_subscription, output_table, window_interval): +def run( + input_subscription: str, + output_table: str, + window_interval_sec: int = 60, + beam_args: List[str] = None, +) -> None: """Build and run the pipeline.""" - options = PipelineOptions(args, save_main_session=True, streaming=True) + options = PipelineOptions(beam_args, save_main_session=True, streaming=True) with beam.Pipeline(options=options) as pipeline: - - # Read the messages from PubSub and process them. messages = ( pipeline - | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub( - subscription=input_subscription).with_output_types(bytes) - | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8')) - | 'Parse JSON messages' >> beam.Map(parse_json_message) - | 'Fixed-size windows' >> beam.WindowInto( - window.FixedWindows(int(window_interval), 0)) - | 'Add URL keys' >> beam.Map(lambda msg: (msg['url'], msg)) - | 'Group by URLs' >> beam.GroupByKey() - | 'Get statistics' >> beam.Map(get_statistics)) + | "Read from Pub/Sub" + >> beam.io.ReadFromPubSub( + subscription=input_subscription + ).with_output_types(bytes) + | "UTF-8 bytes to string" >> beam.Map(lambda msg: msg.decode("utf-8")) + | "Parse JSON messages" >> beam.Map(parse_json_message) + | "Fixed-size windows" + >> beam.WindowInto(window.FixedWindows(window_interval_sec, 0)) + | "Add URL keys" >> beam.WithKeys(lambda msg: msg["url"]) + | "Group by URLs" >> beam.GroupByKey() + | "Get statistics" + >> beam.MapTuple( + lambda url, messages: { + "url": url, + "num_reviews": len(messages), + "score": sum(msg["score"] for msg in messages) / len(messages), + "first_date": min(msg["processing_time"] for msg in messages), + "last_date": max(msg["processing_time"] for msg in messages), + } + ) + ) # Output the results into BigQuery table. - _ = messages | 'Write to Big Query' >> beam.io.WriteToBigQuery( - output_table, schema=SCHEMA) + _ = messages | "Write to Big Query" >> beam.io.WriteToBigQuery( + output_table, schema=SCHEMA + ) -if __name__ == '__main__': +if __name__ == "__main__": logging.getLogger().setLevel(logging.INFO) + parser = argparse.ArgumentParser() parser.add_argument( - '--output_table', - help='Output BigQuery table for results specified as: ' - 'PROJECT:DATASET.TABLE or DATASET.TABLE.') + "--output_table", + help="Output BigQuery table for results specified as: " + "PROJECT:DATASET.TABLE or DATASET.TABLE.", + ) parser.add_argument( - '--input_subscription', - help='Input PubSub subscription of the form ' - '"projects//subscriptions/."') + "--input_subscription", + help="Input PubSub subscription of the form " + '"projects//subscriptions/."', + ) parser.add_argument( - '--window_interval', + "--window_interval_sec", default=60, - help='Window interval in seconds for grouping incoming messages.') - known_args, pipeline_args = parser.parse_known_args() - run(pipeline_args, known_args.input_subscription, known_args.output_table, - known_args.window_interval) + type=int, + help="Window interval in seconds for grouping incoming messages.", + ) + args, beam_args = parser.parse_known_args() + + run( + input_subscription=args.input_subscription, + output_table=args.output_table, + window_interval_sec=args.window_interval_sec, + beam_args=beam_args, + ) diff --git a/dataflow/flex-templates/streaming_beam/streaming_beam_test.py b/dataflow/flex-templates/streaming_beam/streaming_beam_test.py deleted file mode 100644 index d588adf432f..00000000000 --- a/dataflow/flex-templates/streaming_beam/streaming_beam_test.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -import multiprocessing as mp -import os -import subprocess as sp -import tempfile -import time -import uuid - -from google.cloud import bigquery -from google.cloud import pubsub -import pytest - - -PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] -UUID = str(uuid.uuid4()).split('-')[0] -DATASET = 'beam_samples_{}'.format(UUID) -TABLE = 'streaming_beam_sql' -TOPIC = 'messages-{}'.format(UUID) -SUBSCRIPTION = TOPIC - - -@pytest.fixture -def topic_path(): - publisher_client = pubsub.PublisherClient() - topic_path = publisher_client.topic_path(PROJECT, TOPIC) - try: - publisher_client.delete_topic(topic_path) - except Exception: - pass - topic = publisher_client.create_topic(topic_path) - yield topic.name - # Due to the pinned library dependencies in apache-beam, client - # library throws an error upon deletion. - # We use gcloud for a workaround. See also: - # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - sp.check_call( - ['gcloud', 'pubsub', '--project', PROJECT, 'topics', 'delete', TOPIC]) - - -@pytest.fixture -def subscription_path(topic_path): - subscriber = pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path(PROJECT, SUBSCRIPTION) - try: - subscriber.delete_subscription(subscription_path) - except Exception: - pass - subscription = subscriber.create_subscription(subscription_path, topic_path) - yield subscription.name - - # Due to the pinned library dependencies in apache-beam, client - # library throws an error upon deletion. - # We use gcloud for a workaround. See also: - # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - sp.check_call( - ['gcloud', 'pubsub', '--project', PROJECT, 'subscriptions', 'delete', - SUBSCRIPTION]) - - -@pytest.fixture -def dataset(): - bigquery_client = bigquery.Client(project=PROJECT) - dataset_id = '{}.{}'.format(PROJECT, DATASET) - dataset = bigquery.Dataset(dataset_id) - dataset = bigquery_client.create_dataset(dataset, exists_ok=True) - yield '{}:{}'.format(PROJECT, DATASET) - bigquery_client.delete_table('{}.{}'.format(DATASET, TABLE), not_found_ok=True) - bigquery_client.delete_dataset(DATASET, not_found_ok=True) - - -def _infinite_publish_job(topic_path): - publisher_client = pubsub.PublisherClient() - while True: - future = publisher_client.publish( - topic_path, - b'{"url": "https://beam.apache.org/", "review": "positive"}') - future.result() - time.sleep(1) - - -def test_dataflow_flex_templates_pubsub_to_bigquery(dataset, topic_path, - subscription_path): - # Use one process to publish messages to a topic. - publish_process = mp.Process(target=lambda: _infinite_publish_job(topic_path)) - - # Use another process to run the streaming pipeline that should write one - # row to BigQuery every minute (according to the default window size). - pipeline_process = mp.Process(target=lambda: sp.call([ - 'python', 'streaming_beam.py', - '--project', PROJECT, - '--runner', 'DirectRunner', - '--temp_location', tempfile.mkdtemp(), - '--input_subscription', subscription_path, - '--output_table', '{}.{}'.format(dataset, TABLE), - '--window_interval', '5', - ])) - - publish_process.start() - pipeline_process.start() - - pipeline_process.join(timeout=30) - publish_process.join(timeout=0) - - pipeline_process.terminate() - publish_process.terminate() - - # Check for output data in BigQuery. - bigquery_client = bigquery.Client(project=PROJECT) - query = 'SELECT * FROM {}.{}'.format(DATASET, TABLE) - query_job = bigquery_client.query(query) - rows = query_job.result() - assert rows.total_rows > 0 - for row in rows: - assert row['score'] == 1 - - -# TODO:Testcase using Teststream currently does not work as intended. -# The first write to BigQuery fails. Have filed a bug. The test case -# to be changed once the bug gets fixed. b/152446921 -''' -@mock.patch("apache_beam.Pipeline", TestPipeline) -@mock.patch( - "apache_beam.io.ReadFromPubSub", - lambda subscription: ( - TestStream() - .advance_watermark_to(0) - .advance_processing_time(60) - .add_elements([TimestampedValue( - b'{"url": "https://beam.apache.org/", "review": "positive"}', - 1575937195)]) - .advance_processing_time(60) - .add_elements([TimestampedValue( - b'{"url": "https://beam.apache.org/", "review": "positive"}', - 1575937255)]) - .advance_watermark_to_infinity() - ), -) -def test_dataflow_flex_templates_pubsub_to_bigquery(dataset): - streaming_beam.run( - args=[ - "--project", PROJECT, - "--runner", "DirectRunner" - ], - input_subscription="unused", - output_table='{}:{}.{}'.format(PROJECT, DATASET, TABLE), - ) - - # Check for output data in BigQuery. - bigquery_client = bigquery.Client(project=PROJECT) - query = 'SELECT * FROM {}.{}'.format(DATASET, TABLE) - query_job = bigquery_client.query(query) - rows = query_job.result() - assert rows.total_rows > 0 -''' diff --git a/dataflow/requirements-test.txt b/dataflow/requirements-test.txt new file mode 100644 index 00000000000..16def8b7d52 --- /dev/null +++ b/dataflow/requirements-test.txt @@ -0,0 +1,4 @@ +backoff==1.10.0 +google-api-python-client==2.1.0 +pytest-xdist==2.2.1 +pytest==6.2.1 diff --git a/dataflow/testing_utils.py b/dataflow/testing_utils.py new file mode 100644 index 00000000000..f0cedcec36e --- /dev/null +++ b/dataflow/testing_utils.py @@ -0,0 +1,210 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from datetime import time +import itertools +import subprocess +import multiprocessing as mp +from typing import Callable, List, Optional + +import backoff +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError +from google.cloud import bigquery +from google.cloud import pubsub +from google.cloud import storage + +dataflow = build("dataflow", "v1b3") + + +RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds + + +def storage_bucket(bucket_name: str) -> str: + storage_client = storage.Client() + bucket = storage_client.create_bucket(bucket_name) + + print(f"storage_bucket: {repr(bucket_name)}") + yield bucket_name + + bucket.delete(force=True) + + +def bigquery_dataset(project: str, dataset_name: str) -> str: + bigquery_client = bigquery.Client() + dataset = bigquery.Dataset(f"{project}.{dataset_name}") + dataset = bigquery_client.create_dataset(dataset) + + print(f"bigquery_dataset: {dataset.full_dataset_id}") + yield dataset.full_dataset_id + + bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True) + + +def pubsub_topic(project: str, topic_name: str) -> str: + publisher_client = pubsub.PublisherClient() + topic_path = publisher_client.topic_path(project, topic_name) + topic = publisher_client.create_topic(topic_path) + + print(f"pubsub_topic: {repr(topic.name)}") + yield topic.name + + # Due to the pinned library dependencies in apache-beam, client + # library throws an error upon deletion. + # We use gcloud for a workaround. See also: + # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 + subprocess.check_call( + ["gcloud", "pubsub", "--project", project, "topics", "delete", topic], + check=True, + ) + + +def pubsub_subscription(project: str, topic_path: str, subscription_name: str) -> str: + subscriber = pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_name) + subscription = subscriber.create_subscription(subscription_path, topic_path) + + print(f"pubsub_subscription: {repr(subscription.name)}") + yield subscription.name + + # Due to the pinned library dependencies in apache-beam, client + # library throws an error upon deletion. + # We use gcloud for a workaround. See also: + # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 + subprocess.check_call( + [ + "gcloud", + "pubsub", + "--project", + project, + "subscriptions", + "delete", + subscription_name, + ], + check=True, + ) + + +def pubsub_publisher(topic_path: str, new_msg: Callable[[int], str]) -> bool: + def _infinite_publish_job() -> None: + publisher_client = pubsub.PublisherClient() + for i in itertools.count(): + publisher_client.publish(topic_path, new_msg(i).encode("utf-8")).result() + time.sleep(1) + + # Start a subprocess in the background to do the publishing. + p = mp.Process(target=_infinite_publish_job) + p.start() + + yield p.is_alive() + + # For cleanup, terminate the background process. + p.join(timeout=0) + p.terminate() + + +def container_image(project: str, image_name: str) -> str: + subprocess.run(["gcloud", "auth", "configure-docker"], check=True) + subprocess.run( + [ + "gcloud", + "builds", + "submit", + f"--project={project}", + f"--tag={image_name}", + ".", + ], + check=True, + ) + + yield image_name + + subprocess.run( + [ + "gcloud", + "container", + "images", + "delete", + image_name, + f"--project={project}", + "--quiet", + ], + check=True, + ) + + +def dataflow_job_id_from_job_name(project: str, job_name: str) -> Optional[str]: + # Only return the 50 most recent results - our job is likely to be in here. + # If the job is not found, first try increasing this number. + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list + jobs_request = ( + dataflow.projects() + .jobs() + .list( + projectId=project, + filter="ACTIVE", + pageSize=50, + ) + ) + response = jobs_request.execute() + + # Search for the job in the list that has our name (names are unique) + for job in response["jobs"]: + if job["name"] == job_name: + return job["id"] + return None + + +@backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) +def dataflow_jobs_cancel(project: str, job_name: str) -> None: + # To cancel a dataflow job, we need its ID, not its name + job_id = dataflow_job_id_from_job_name(project, job_name) + + if job_id is not None: + # Cancel the Dataflow job if it exists. + # If it doesn't, job_id will be equal to None. + # For more info, see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update + request = ( + dataflow.projects() + .jobs() + .update( + projectId=project, + jobId=job_id, + body={"requestedState": "JOB_STATE_CANCELLED"}, + ) + ) + request.execute() + + +def dataflow_flex_template_build( + bucket_name: str, template_file: str, template_image: str, metadata_file: str +) -> str: + subprocess.call( + [ + "gcloud", + "dataflow", + "flex-template", + "build", + f"gs://{bucket_name}/{template_file}", + f"--image={template_image}", + "--sdk-language=PYTHON", + f"--metadata-file={metadata_file}", + ], + check=True, + ) + + yield f"gs://{bucket_name}/{template_file}" + + storage_client = storage.Client() + storage_client.bucket(bucket_name).blob(template_file).delete() From bfdd061a36e0ce77a69d88998aba05db3151e091 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 13 Apr 2021 15:36:31 -0700 Subject: [PATCH 02/57] update headers --- dataflow/flex-templates/streaming_beam/e2e_test.py | 2 +- dataflow/flex-templates/streaming_beam/noxfile_config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index ded698e5e30..1b78020fc69 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. diff --git a/dataflow/flex-templates/streaming_beam/noxfile_config.py b/dataflow/flex-templates/streaming_beam/noxfile_config.py index b6ba946dcbc..79bccdd3e16 100644 --- a/dataflow/flex-templates/streaming_beam/noxfile_config.py +++ b/dataflow/flex-templates/streaming_beam/noxfile_config.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 32e50ed2174be7b9c4e76dd8ce0d5845208833dc Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 13 Apr 2021 15:42:22 -0700 Subject: [PATCH 03/57] add requirements-test.txt --- .../{ => flex-templates/streaming_beam}/requirements-test.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dataflow/{ => flex-templates/streaming_beam}/requirements-test.txt (100%) diff --git a/dataflow/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt similarity index 100% rename from dataflow/requirements-test.txt rename to dataflow/flex-templates/streaming_beam/requirements-test.txt From 2dc5ba12d3ecfdb710535b0e1b2bb26cc90b0571 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 13 Apr 2021 15:57:21 -0700 Subject: [PATCH 04/57] enable relative import --- dataflow/flex-templates/__init__.py | 13 +++++++++++++ dataflow/flex-templates/streaming_beam/e2e_test.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 dataflow/flex-templates/__init__.py diff --git a/dataflow/flex-templates/__init__.py b/dataflow/flex-templates/__init__.py new file mode 100644 index 00000000000..e7ed7703169 --- /dev/null +++ b/dataflow/flex-templates/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from .. import testing_utils diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 1b78020fc69..d043e52c0b8 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -18,7 +18,7 @@ from google.cloud import bigquery import pytest -from . import testing_utils +from .. import testing_utils SUFFIX = uuid.uuid4().hex[0:6] From 5c7a885d8814cbc0260df33126632e4f090427cd Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 13 Apr 2021 16:04:39 -0700 Subject: [PATCH 05/57] add __init__.py --- dataflow/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 dataflow/__init__.py diff --git a/dataflow/__init__.py b/dataflow/__init__.py new file mode 100644 index 00000000000..ffc78f34e19 --- /dev/null +++ b/dataflow/__init__.py @@ -0,0 +1,11 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. From f6027ffeeb2235716894336c0d98ce13a6c52e19 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 14 Apr 2021 13:22:18 -0700 Subject: [PATCH 06/57] add __init__.py --- dataflow/flex-templates/streaming_beam/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 dataflow/flex-templates/streaming_beam/__init__.py diff --git a/dataflow/flex-templates/streaming_beam/__init__.py b/dataflow/flex-templates/streaming_beam/__init__.py new file mode 100644 index 00000000000..ffc78f34e19 --- /dev/null +++ b/dataflow/flex-templates/streaming_beam/__init__.py @@ -0,0 +1,11 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. From 2c6c53ac450e28bb66b06881046171755b536f5a Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 27 Apr 2021 13:36:50 -0700 Subject: [PATCH 07/57] modularized tests with conftest --- dataflow/conftest.py | 292 ++++++++++++++++++ .../flex-templates/streaming_beam/e2e_test.py | 105 +++---- dataflow/testing_utils.py | 210 ------------- 3 files changed, 338 insertions(+), 269 deletions(-) create mode 100644 dataflow/conftest.py delete mode 100644 dataflow/testing_utils.py diff --git a/dataflow/conftest.py b/dataflow/conftest.py new file mode 100644 index 00000000000..7995f922e1b --- /dev/null +++ b/dataflow/conftest.py @@ -0,0 +1,292 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from dataclasses import dataclass +from datetime import time +import itertools +import json +import subprocess +import multiprocessing as mp +import os +from typing import Callable, Dict, Optional +import uuid + +import backoff +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError +from google.cloud import bigquery +from google.cloud.bigquery.table import RowIterator +from google.cloud import pubsub +from google.cloud import storage +import pytest + +dataflow = build("dataflow", "v1b3") + +# Default options. +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] +REGION: str = "us-west1" +ZONE: str = "us-west1-b" + +RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds + + +@dataclass +class Utils: + uuid: str = uuid.uuid4().hex[0:6] + project: str = PROJECT + region: str = REGION + zone: str = ZONE + + @staticmethod + def storage_bucket(bucket_name: str) -> str: + storage_client = storage.Client() + bucket_unique_name = f"{bucket_name}-{Utils.uuid}" + bucket = storage_client.create_bucket(bucket_unique_name) + + print(f"storage_bucket: {repr(bucket_unique_name)}") + yield bucket_unique_name + + bucket.delete(force=True) + + @staticmethod + def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str: + bigquery_client = bigquery.Client() + dataset = bigquery_client.create_dataset( + bigquery.Dataset(f"{project}.{dataset_name}_{Utils.uuid}") + ) + + print(f"bigquery_dataset: {dataset.full_dataset_id}") + yield dataset.full_dataset_id + + bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True) + + @staticmethod + def bigquery_query(query: str) -> RowIterator: + bigquery_client = bigquery.Client() + query_job = bigquery_client.query(query) + return query_job.result() + + @staticmethod + def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: + publisher_client = pubsub.PublisherClient() + topic_path = publisher_client.topic_path(project, f"{topic_name}-{Utils.uuid}") + topic = publisher_client.create_topic(topic_path) + + print(f"pubsub_topic: {repr(topic.name)}") + yield topic.name + + # Due to the pinned library dependencies in apache-beam, client + # library throws an error upon deletion. + # We use gcloud for a workaround. See also: + # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 + subprocess.check_call( + ["gcloud", "pubsub", "--project", project, "topics", "delete", topic], + check=True, + ) + + @staticmethod + def pubsub_subscription( + topic_path: str, subscription_name: str, project: str = PROJECT + ) -> str: + subscriber = pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path( + project, f"{subscription_name}-{Utils.uuid}" + ) + subscription = subscriber.create_subscription(subscription_path, topic_path) + + print(f"pubsub_subscription: {repr(subscription.name)}") + yield subscription.name + + # Due to the pinned library dependencies in apache-beam, client + # library throws an error upon deletion. + # We use gcloud for a workaround. See also: + # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 + subprocess.check_call( + [ + "gcloud", + "pubsub", + "--project", + project, + "subscriptions", + "delete", + subscription_name, + ], + check=True, + ) + + @staticmethod + def pubsub_publisher( + topic_path: str, + new_msg: Callable[[int], str] = lambda i: json.dumps( + {"id": i, "content": f"message {i}"} + ), + ) -> bool: + def _infinite_publish_job() -> None: + publisher_client = pubsub.PublisherClient() + for i in itertools.count(): + publisher_client.publish( + topic_path, new_msg(i).encode("utf-8") + ).result() + time.sleep(1) + + # Start a subprocess in the background to do the publishing. + p = mp.Process(target=_infinite_publish_job) + p.start() + + yield p.is_alive() + + # For cleanup, terminate the background process. + p.join(timeout=0) + p.terminate() + + @staticmethod + def container_image( + image_path: str, + project: str = PROJECT, + tag: str = "latest", + ) -> str: + image_name = f"gcr.io/{project}/{image_path}-{Utils.uuid}:{tag}" + subprocess.run(["gcloud", "auth", "configure-docker"], check=True) + subprocess.run( + [ + "gcloud", + "builds", + "submit", + f"--project={project}", + f"--tag={image_name}", + ".", + ], + check=True, + ) + + yield image_name + + subprocess.run( + [ + "gcloud", + "container", + "images", + "delete", + image_name, + f"--project={project}", + "--quiet", + ], + check=True, + ) + + @staticmethod + def dataflow_job_id_from_job_name( + job_name: str, project: str = PROJECT + ) -> Optional[str]: + # Only return the 50 most recent results - our job is likely to be in here. + # If the job is not found, first try increasing this number. + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list + jobs_request = ( + dataflow.projects() + .jobs() + .list( + projectId=project, + filter="ACTIVE", + pageSize=50, + ) + ) + response = jobs_request.execute() + + # Search for the job in the list that has our name (names are unique) + for job in response["jobs"]: + if job["name"] == job_name: + return job["id"] + return None + + @staticmethod + @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) + def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None: + # To cancel a dataflow job, we need its ID, not its name + job_id = Utils.dataflow_job_id_from_job_name(project, job_name) + + if job_id is not None: + # Cancel the Dataflow job if it exists. + # If it doesn't, job_id will be equal to None. + # For more info, see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update + request = ( + dataflow.projects() + .jobs() + .update( + projectId=project, + jobId=job_id, + body={"requestedState": "JOB_STATE_CANCELLED"}, + ) + ) + request.execute() + + @staticmethod + def dataflow_flex_template_build( + bucket_name: str, + template_image: str, + metadata_file: str, + project: str = PROJECT, + template_file: str = "template.json", + ) -> str: + subprocess.call( + [ + "gcloud", + "dataflow", + "flex-template", + "build", + f"gs://{bucket_name}/{template_file}", + f"--project={project}", + f"--image={template_image}", + "--sdk-language=PYTHON", + f"--metadata-file={metadata_file}", + ], + check=True, + ) + + yield f"gs://{bucket_name}/{template_file}" + + storage_client = storage.Client() + storage_client.bucket(bucket_name).blob(template_file).delete() + + @staticmethod + def dataflow_flex_template_run( + job_name: str, + template_path: str, + bucket_name: str, + parameters: Dict[str, str] = {}, + project: str = PROJECT, + region: str = REGION, + ) -> str: + unique_job_name = f"{job_name}-{Utils.uuid}" + subprocess.call( + [ + "gcloud", + "dataflow", + "flex-template", + "run", + unique_job_name, + f"--template-file-gcs-location={template_path}", + f"--project={project}", + f"--region={region}", + f"--temp_location=gs://{bucket_name}/temp", + ] + + [f"--parameters={name}={value}" for name, value in parameters.items()], + check=True, + ) + + yield unique_job_name + + +@pytest.fixture(scope="session") +def utils(): + return Utils() diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index d043e52c0b8..1072765c351 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -10,103 +10,90 @@ # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import os -import subprocess +import json +from conftest import Utils import time -import uuid -from google.cloud import bigquery import pytest -from .. import testing_utils - - -SUFFIX = uuid.uuid4().hex[0:6] -PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] -BUCKET_NAME = f"flex-templates-streaming-beam-{SUFFIX}" -BIGQUERY_DATASET = f"flex_templates_{SUFFIX}" -BIGQUERY_TABLE = "streaming_beam" -TOPIC = f"flex-templates-streaming-beam-{SUFFIX}" -SUBSCRIPTION = TOPIC -IMAGE_NAME = f"gcr.io/{PROJECT}/dataflow/flex-templates/streaming-beam-{SUFFIX}:latest" -TEMPLATE_FILE = "template.json" -REGION = "us-central1" - @pytest.fixture(scope="session") -def bucket_name() -> str: - return testing_utils.storage_bucket(BUCKET_NAME) +def bucket_name(utils: Utils) -> str: + return utils.storage_bucket("dataflow-flex-templates-streaming-beam") @pytest.fixture(scope="session") -def topic_path() -> str: - return testing_utils.pubsub_topic(PROJECT, TOPIC) +def pubsub_topic(utils: Utils) -> str: + return utils.pubsub_topic("dataflow-flex-templates-streaming-beam") @pytest.fixture(scope="session") -def subscription_path(topic_path: str) -> str: - return testing_utils.pubsub_subscription(PROJECT, topic_path, SUBSCRIPTION) +def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str: + return utils.pubsub_subscription( + pubsub_topic, "dataflow-flex-templates-streaming-beam" + ) @pytest.fixture(scope="session") -def bigquery_dataset() -> str: - return testing_utils.bigquery_dataset(PROJECT, BIGQUERY_DATASET) +def bigquery_dataset(utils: Utils) -> str: + return utils.bigquery_dataset("dataflow_flex_templates") @pytest.fixture(scope="session") -def publisher(topic_path: str) -> bool: - return testing_utils.pubsub_publisher(topic_path) +def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool: + return utils.pubsub_publisher( + pubsub_topic, + new_msg=lambda i: json.dumps( + { + "url": "https://beam.apache.org/", + "review": "positive" if i % 2 == 0 else "negative", + } + ), + ) @pytest.fixture(scope="session") -def template_image() -> str: - return testing_utils.container_image(PROJECT, IMAGE_NAME) +def flex_template_image(utils: Utils) -> str: + return utils.container_image(f"dataflow/flex-templates/streaming-beam") @pytest.fixture(scope="session") -def template_path(bucket_name: str, template_image: str) -> str: - return testing_utils.dataflow_flex_template_build( +def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str: + return utils.dataflow_flex_template_build( bucket_name=bucket_name, - template_file=TEMPLATE_FILE, - template_image=template_image, + template_image=flex_template_image, metadata_file="metadata.json", ) def test_run_template( - publisher: str, + utils: Utils, bucket_name: str, - template_path: str, - dataset: str, - subscription_path: str, + pubsub_publisher: str, + pubsub_subscription: str, + flex_template_path: str, + bigquery_dataset: str, ) -> None: - job_name = f"flex-templates-streaming-beam-{SUFFIX}" - subprocess.call( - [ - "gcloud", - "dataflow", - "flex-template", - "run", - job_name, - f"--template-file-gcs-location={template_path}", - f"--temp_location=gs://{bucket_name}/temp", - f"--parameters=input_subscription={subscription_path}", - f"--parameters=output_table={dataset}.{BIGQUERY_TABLE}", - f"--region={REGION}", - ], - check=True, + bigquery_table = "streaming_beam" + job_name = utils.dataflow_flex_template_run( + job_name="flex-templates-streaming-beam", + template_path=flex_template_path, + bucket_name=bucket_name, + parameters={ + "input_subscription": pubsub_subscription, + "output_table": f"{bigquery_dataset}.{bigquery_table}", + }, ) + # Since this is a streaming job, it will never finish running. # Wait for 10 minutes, and then cancel the job. time.sleep(10 * 60) - testing_utils.dataflow_jobs_cancel(PROJECT, job_name) + utils.dataflow_jobs_cancel(job_name) # Check for output data in BigQuery. - bigquery_client = bigquery.Client() - query = f"SELECT * FROM {PROJECT}.{BIGQUERY_DATASET}.{BIGQUERY_TABLE}" - query_job = bigquery_client.query(query) - rows = query_job.result() + query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}" + rows = utils.bigquery_query(query) assert rows.total_rows > 0 for row in rows: - assert row["score"] == 1 + assert "score" in row diff --git a/dataflow/testing_utils.py b/dataflow/testing_utils.py deleted file mode 100644 index f0cedcec36e..00000000000 --- a/dataflow/testing_utils.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -from datetime import time -import itertools -import subprocess -import multiprocessing as mp -from typing import Callable, List, Optional - -import backoff -from googleapiclient.discovery import build -from googleapiclient.errors import HttpError -from google.cloud import bigquery -from google.cloud import pubsub -from google.cloud import storage - -dataflow = build("dataflow", "v1b3") - - -RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds - - -def storage_bucket(bucket_name: str) -> str: - storage_client = storage.Client() - bucket = storage_client.create_bucket(bucket_name) - - print(f"storage_bucket: {repr(bucket_name)}") - yield bucket_name - - bucket.delete(force=True) - - -def bigquery_dataset(project: str, dataset_name: str) -> str: - bigquery_client = bigquery.Client() - dataset = bigquery.Dataset(f"{project}.{dataset_name}") - dataset = bigquery_client.create_dataset(dataset) - - print(f"bigquery_dataset: {dataset.full_dataset_id}") - yield dataset.full_dataset_id - - bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True) - - -def pubsub_topic(project: str, topic_name: str) -> str: - publisher_client = pubsub.PublisherClient() - topic_path = publisher_client.topic_path(project, topic_name) - topic = publisher_client.create_topic(topic_path) - - print(f"pubsub_topic: {repr(topic.name)}") - yield topic.name - - # Due to the pinned library dependencies in apache-beam, client - # library throws an error upon deletion. - # We use gcloud for a workaround. See also: - # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - subprocess.check_call( - ["gcloud", "pubsub", "--project", project, "topics", "delete", topic], - check=True, - ) - - -def pubsub_subscription(project: str, topic_path: str, subscription_name: str) -> str: - subscriber = pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path(project, subscription_name) - subscription = subscriber.create_subscription(subscription_path, topic_path) - - print(f"pubsub_subscription: {repr(subscription.name)}") - yield subscription.name - - # Due to the pinned library dependencies in apache-beam, client - # library throws an error upon deletion. - # We use gcloud for a workaround. See also: - # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - subprocess.check_call( - [ - "gcloud", - "pubsub", - "--project", - project, - "subscriptions", - "delete", - subscription_name, - ], - check=True, - ) - - -def pubsub_publisher(topic_path: str, new_msg: Callable[[int], str]) -> bool: - def _infinite_publish_job() -> None: - publisher_client = pubsub.PublisherClient() - for i in itertools.count(): - publisher_client.publish(topic_path, new_msg(i).encode("utf-8")).result() - time.sleep(1) - - # Start a subprocess in the background to do the publishing. - p = mp.Process(target=_infinite_publish_job) - p.start() - - yield p.is_alive() - - # For cleanup, terminate the background process. - p.join(timeout=0) - p.terminate() - - -def container_image(project: str, image_name: str) -> str: - subprocess.run(["gcloud", "auth", "configure-docker"], check=True) - subprocess.run( - [ - "gcloud", - "builds", - "submit", - f"--project={project}", - f"--tag={image_name}", - ".", - ], - check=True, - ) - - yield image_name - - subprocess.run( - [ - "gcloud", - "container", - "images", - "delete", - image_name, - f"--project={project}", - "--quiet", - ], - check=True, - ) - - -def dataflow_job_id_from_job_name(project: str, job_name: str) -> Optional[str]: - # Only return the 50 most recent results - our job is likely to be in here. - # If the job is not found, first try increasing this number. - # For more info see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list - jobs_request = ( - dataflow.projects() - .jobs() - .list( - projectId=project, - filter="ACTIVE", - pageSize=50, - ) - ) - response = jobs_request.execute() - - # Search for the job in the list that has our name (names are unique) - for job in response["jobs"]: - if job["name"] == job_name: - return job["id"] - return None - - -@backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) -def dataflow_jobs_cancel(project: str, job_name: str) -> None: - # To cancel a dataflow job, we need its ID, not its name - job_id = dataflow_job_id_from_job_name(project, job_name) - - if job_id is not None: - # Cancel the Dataflow job if it exists. - # If it doesn't, job_id will be equal to None. - # For more info, see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update - request = ( - dataflow.projects() - .jobs() - .update( - projectId=project, - jobId=job_id, - body={"requestedState": "JOB_STATE_CANCELLED"}, - ) - ) - request.execute() - - -def dataflow_flex_template_build( - bucket_name: str, template_file: str, template_image: str, metadata_file: str -) -> str: - subprocess.call( - [ - "gcloud", - "dataflow", - "flex-template", - "build", - f"gs://{bucket_name}/{template_file}", - f"--image={template_image}", - "--sdk-language=PYTHON", - f"--metadata-file={metadata_file}", - ], - check=True, - ) - - yield f"gs://{bucket_name}/{template_file}" - - storage_client = storage.Client() - storage_client.bucket(bucket_name).blob(template_file).delete() From 8417d63bed90835182c44e5ba913dab87a11fa86 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 27 Apr 2021 13:48:44 -0700 Subject: [PATCH 08/57] fix lint issues --- dataflow/conftest.py | 489 +++++++++--------- dataflow/flex-templates/__init__.py | 2 - .../flex-templates/streaming_beam/e2e_test.py | 36 +- dataflow/run_template/main_test.py | 111 ++-- 4 files changed, 320 insertions(+), 318 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 7995f922e1b..8bbb4d59f31 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -10,283 +10,268 @@ # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from dataclasses import dataclass from datetime import time import itertools import json -import subprocess import multiprocessing as mp import os +import subprocess from typing import Callable, Dict, Optional import uuid import backoff -from googleapiclient.discovery import build -from googleapiclient.errors import HttpError from google.cloud import bigquery -from google.cloud.bigquery.table import RowIterator from google.cloud import pubsub from google.cloud import storage -import pytest +from google.cloud.bigquery.table import RowIterator +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError dataflow = build("dataflow", "v1b3") # Default options. +UUID = uuid.uuid4().hex[0:6] PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] -REGION: str = "us-west1" -ZONE: str = "us-west1-b" +REGION = "us-west1" +ZONE = "us-west1-b" RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds -@dataclass -class Utils: - uuid: str = uuid.uuid4().hex[0:6] - project: str = PROJECT - region: str = REGION - zone: str = ZONE - - @staticmethod - def storage_bucket(bucket_name: str) -> str: - storage_client = storage.Client() - bucket_unique_name = f"{bucket_name}-{Utils.uuid}" - bucket = storage_client.create_bucket(bucket_unique_name) - - print(f"storage_bucket: {repr(bucket_unique_name)}") - yield bucket_unique_name - - bucket.delete(force=True) - - @staticmethod - def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str: - bigquery_client = bigquery.Client() - dataset = bigquery_client.create_dataset( - bigquery.Dataset(f"{project}.{dataset_name}_{Utils.uuid}") - ) - - print(f"bigquery_dataset: {dataset.full_dataset_id}") - yield dataset.full_dataset_id - - bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True) - - @staticmethod - def bigquery_query(query: str) -> RowIterator: - bigquery_client = bigquery.Client() - query_job = bigquery_client.query(query) - return query_job.result() - - @staticmethod - def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: +def storage_bucket(bucket_name: str) -> str: + storage_client = storage.Client() + bucket_unique_name = f"{bucket_name}-{UUID}" + bucket = storage_client.create_bucket(bucket_unique_name) + + print(f"storage_bucket: {repr(bucket_unique_name)}") + yield bucket_unique_name + + bucket.delete(force=True) + + +def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str: + bigquery_client = bigquery.Client() + dataset = bigquery_client.create_dataset( + bigquery.Dataset(f"{project}.{dataset_name}_{UUID}") + ) + + print(f"bigquery_dataset: {dataset.full_dataset_id}") + yield dataset.full_dataset_id + + bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True) + + +def bigquery_query(query: str) -> RowIterator: + bigquery_client = bigquery.Client() + query_job = bigquery_client.query(query) + return query_job.result() + + +def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: + publisher_client = pubsub.PublisherClient() + topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}") + topic = publisher_client.create_topic(topic_path) + + print(f"pubsub_topic: {repr(topic.name)}") + yield topic.name + + # Due to the pinned library dependencies in apache-beam, client + # library throws an error upon deletion. + # We use gcloud for a workaround. See also: + # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 + subprocess.check_call( + ["gcloud", "pubsub", "--project", project, "topics", "delete", topic], + check=True, + ) + + +def pubsub_subscription( + topic_path: str, subscription_name: str, project: str = PROJECT +) -> str: + subscriber = pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path( + project, f"{subscription_name}-{UUID}" + ) + subscription = subscriber.create_subscription(subscription_path, topic_path) + + print(f"pubsub_subscription: {repr(subscription.name)}") + yield subscription.name + + # Due to the pinned library dependencies in apache-beam, client + # library throws an error upon deletion. + # We use gcloud for a workaround. See also: + # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 + subprocess.check_call( + [ + "gcloud", + "pubsub", + "--project", + project, + "subscriptions", + "delete", + subscription_name, + ], + check=True, + ) + + +def pubsub_publisher( + topic_path: str, + new_msg: Callable[[int], str] = lambda i: json.dumps( + {"id": i, "content": f"message {i}"} + ), +) -> bool: + def _infinite_publish_job() -> None: publisher_client = pubsub.PublisherClient() - topic_path = publisher_client.topic_path(project, f"{topic_name}-{Utils.uuid}") - topic = publisher_client.create_topic(topic_path) - - print(f"pubsub_topic: {repr(topic.name)}") - yield topic.name - - # Due to the pinned library dependencies in apache-beam, client - # library throws an error upon deletion. - # We use gcloud for a workaround. See also: - # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - subprocess.check_call( - ["gcloud", "pubsub", "--project", project, "topics", "delete", topic], - check=True, - ) - - @staticmethod - def pubsub_subscription( - topic_path: str, subscription_name: str, project: str = PROJECT - ) -> str: - subscriber = pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path( - project, f"{subscription_name}-{Utils.uuid}" - ) - subscription = subscriber.create_subscription(subscription_path, topic_path) - - print(f"pubsub_subscription: {repr(subscription.name)}") - yield subscription.name - - # Due to the pinned library dependencies in apache-beam, client - # library throws an error upon deletion. - # We use gcloud for a workaround. See also: - # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - subprocess.check_call( - [ - "gcloud", - "pubsub", - "--project", - project, - "subscriptions", - "delete", - subscription_name, - ], - check=True, - ) - - @staticmethod - def pubsub_publisher( - topic_path: str, - new_msg: Callable[[int], str] = lambda i: json.dumps( - {"id": i, "content": f"message {i}"} - ), - ) -> bool: - def _infinite_publish_job() -> None: - publisher_client = pubsub.PublisherClient() - for i in itertools.count(): - publisher_client.publish( - topic_path, new_msg(i).encode("utf-8") - ).result() - time.sleep(1) - - # Start a subprocess in the background to do the publishing. - p = mp.Process(target=_infinite_publish_job) - p.start() - - yield p.is_alive() - - # For cleanup, terminate the background process. - p.join(timeout=0) - p.terminate() - - @staticmethod - def container_image( - image_path: str, - project: str = PROJECT, - tag: str = "latest", - ) -> str: - image_name = f"gcr.io/{project}/{image_path}-{Utils.uuid}:{tag}" - subprocess.run(["gcloud", "auth", "configure-docker"], check=True) - subprocess.run( - [ - "gcloud", - "builds", - "submit", - f"--project={project}", - f"--tag={image_name}", - ".", - ], - check=True, - ) - - yield image_name - - subprocess.run( - [ - "gcloud", - "container", - "images", - "delete", - image_name, - f"--project={project}", - "--quiet", - ], - check=True, + for i in itertools.count(): + publisher_client.publish(topic_path, new_msg(i).encode("utf-8")).result() + time.sleep(1) + + # Start a subprocess in the background to do the publishing. + p = mp.Process(target=_infinite_publish_job) + p.start() + + yield p.is_alive() + + # For cleanup, terminate the background process. + p.join(timeout=0) + p.terminate() + + +def container_image( + image_path: str, + project: str = PROJECT, + tag: str = "latest", +) -> str: + image_name = f"gcr.io/{project}/{image_path}-{UUID}:{tag}" + subprocess.run(["gcloud", "auth", "configure-docker"], check=True) + subprocess.run( + [ + "gcloud", + "builds", + "submit", + f"--project={project}", + f"--tag={image_name}", + ".", + ], + check=True, + ) + + yield image_name + + subprocess.run( + [ + "gcloud", + "container", + "images", + "delete", + image_name, + f"--project={project}", + "--quiet", + ], + check=True, + ) + + +def dataflow_job_id_from_job_name( + job_name: str, project: str = PROJECT +) -> Optional[str]: + # Only return the 50 most recent results - our job is likely to be in here. + # If the job is not found, first try increasing this number. + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list + jobs_request = ( + dataflow.projects() + .jobs() + .list( + projectId=project, + filter="ACTIVE", + pageSize=50, ) - - @staticmethod - def dataflow_job_id_from_job_name( - job_name: str, project: str = PROJECT - ) -> Optional[str]: - # Only return the 50 most recent results - our job is likely to be in here. - # If the job is not found, first try increasing this number. - # For more info see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list - jobs_request = ( + ) + response = jobs_request.execute() + + # Search for the job in the list that has our name (names are unique) + for job in response["jobs"]: + if job["name"] == job_name: + return job["id"] + return None + + +@backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) +def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None: + # To cancel a dataflow job, we need its ID, not its name + job_id = dataflow_job_id_from_job_name(project, job_name) + + if job_id is not None: + # Cancel the Dataflow job if it exists. + # If it doesn't, job_id will be equal to None. + # For more info, see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update + request = ( dataflow.projects() .jobs() - .list( + .update( projectId=project, - filter="ACTIVE", - pageSize=50, - ) - ) - response = jobs_request.execute() - - # Search for the job in the list that has our name (names are unique) - for job in response["jobs"]: - if job["name"] == job_name: - return job["id"] - return None - - @staticmethod - @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) - def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None: - # To cancel a dataflow job, we need its ID, not its name - job_id = Utils.dataflow_job_id_from_job_name(project, job_name) - - if job_id is not None: - # Cancel the Dataflow job if it exists. - # If it doesn't, job_id will be equal to None. - # For more info, see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update - request = ( - dataflow.projects() - .jobs() - .update( - projectId=project, - jobId=job_id, - body={"requestedState": "JOB_STATE_CANCELLED"}, - ) + jobId=job_id, + body={"requestedState": "JOB_STATE_CANCELLED"}, ) - request.execute() - - @staticmethod - def dataflow_flex_template_build( - bucket_name: str, - template_image: str, - metadata_file: str, - project: str = PROJECT, - template_file: str = "template.json", - ) -> str: - subprocess.call( - [ - "gcloud", - "dataflow", - "flex-template", - "build", - f"gs://{bucket_name}/{template_file}", - f"--project={project}", - f"--image={template_image}", - "--sdk-language=PYTHON", - f"--metadata-file={metadata_file}", - ], - check=True, - ) - - yield f"gs://{bucket_name}/{template_file}" - - storage_client = storage.Client() - storage_client.bucket(bucket_name).blob(template_file).delete() - - @staticmethod - def dataflow_flex_template_run( - job_name: str, - template_path: str, - bucket_name: str, - parameters: Dict[str, str] = {}, - project: str = PROJECT, - region: str = REGION, - ) -> str: - unique_job_name = f"{job_name}-{Utils.uuid}" - subprocess.call( - [ - "gcloud", - "dataflow", - "flex-template", - "run", - unique_job_name, - f"--template-file-gcs-location={template_path}", - f"--project={project}", - f"--region={region}", - f"--temp_location=gs://{bucket_name}/temp", - ] - + [f"--parameters={name}={value}" for name, value in parameters.items()], - check=True, ) - - yield unique_job_name - - -@pytest.fixture(scope="session") -def utils(): - return Utils() + request.execute() + + +@staticmethod +def dataflow_flex_template_build( + bucket_name: str, + template_image: str, + metadata_file: str, + project: str = PROJECT, + template_file: str = "template.json", +) -> str: + subprocess.call( + [ + "gcloud", + "dataflow", + "flex-template", + "build", + f"gs://{bucket_name}/{template_file}", + f"--project={project}", + f"--image={template_image}", + "--sdk-language=PYTHON", + f"--metadata-file={metadata_file}", + ], + check=True, + ) + + yield f"gs://{bucket_name}/{template_file}" + + storage_client = storage.Client() + storage_client.bucket(bucket_name).blob(template_file).delete() + + +def dataflow_flex_template_run( + job_name: str, + template_path: str, + bucket_name: str, + parameters: Dict[str, str] = {}, + project: str = PROJECT, + region: str = REGION, +) -> str: + unique_job_name = f"{job_name}-{UUID}" + subprocess.call( + [ + "gcloud", + "dataflow", + "flex-template", + "run", + unique_job_name, + f"--template-file-gcs-location={template_path}", + f"--project={project}", + f"--region={region}", + f"--temp_location=gs://{bucket_name}/temp", + ] + + [f"--parameters={name}={value}" for name, value in parameters.items()], + check=True, + ) + + yield unique_job_name diff --git a/dataflow/flex-templates/__init__.py b/dataflow/flex-templates/__init__.py index e7ed7703169..ffc78f34e19 100644 --- a/dataflow/flex-templates/__init__.py +++ b/dataflow/flex-templates/__init__.py @@ -9,5 +9,3 @@ # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -from .. import testing_utils diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 1072765c351..b366ff90b40 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -11,36 +11,37 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. import json -from conftest import Utils import time import pytest +import conftest as utils + +NAME = "dataflow-flex-templates-streaming-beam" + @pytest.fixture(scope="session") -def bucket_name(utils: Utils) -> str: - return utils.storage_bucket("dataflow-flex-templates-streaming-beam") +def bucket_name() -> str: + return utils.storage_bucket(NAME) @pytest.fixture(scope="session") -def pubsub_topic(utils: Utils) -> str: - return utils.pubsub_topic("dataflow-flex-templates-streaming-beam") +def pubsub_topic() -> str: + return utils.pubsub_topic(NAME) @pytest.fixture(scope="session") -def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str: - return utils.pubsub_subscription( - pubsub_topic, "dataflow-flex-templates-streaming-beam" - ) +def pubsub_subscription(pubsub_topic: str) -> str: + return utils.pubsub_subscription(pubsub_topic, NAME) @pytest.fixture(scope="session") -def bigquery_dataset(utils: Utils) -> str: - return utils.bigquery_dataset("dataflow_flex_templates") +def bigquery_dataset() -> str: + return utils.bigquery_dataset(NAME.replace("-", "_")) @pytest.fixture(scope="session") -def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool: +def pubsub_publisher(pubsub_topic: str) -> bool: return utils.pubsub_publisher( pubsub_topic, new_msg=lambda i: json.dumps( @@ -53,12 +54,12 @@ def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool: @pytest.fixture(scope="session") -def flex_template_image(utils: Utils) -> str: - return utils.container_image(f"dataflow/flex-templates/streaming-beam") +def flex_template_image() -> str: + return utils.container_image(NAME) @pytest.fixture(scope="session") -def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str: +def flex_template_path(bucket_name: str, flex_template_image: str) -> str: return utils.dataflow_flex_template_build( bucket_name=bucket_name, template_image=flex_template_image, @@ -67,7 +68,6 @@ def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) def test_run_template( - utils: Utils, bucket_name: str, pubsub_publisher: str, pubsub_subscription: str, @@ -75,9 +75,9 @@ def test_run_template( bigquery_dataset: str, ) -> None: - bigquery_table = "streaming_beam" + bigquery_table = "output_table" job_name = utils.dataflow_flex_template_run( - job_name="flex-templates-streaming-beam", + job_name=NAME, template_path=flex_template_path, bucket_name=bucket_name, parameters={ diff --git a/dataflow/run_template/main_test.py b/dataflow/run_template/main_test.py index 6a5b9792692..1c20aeed1b4 100644 --- a/dataflow/run_template/main_test.py +++ b/dataflow/run_template/main_test.py @@ -31,15 +31,15 @@ from werkzeug.urls import url_encode -import main +from . import main RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds -PROJECT = os.environ['GOOGLE_CLOUD_PROJECT'] -BUCKET = os.environ['CLOUD_STORAGE_BUCKET'] +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] +BUCKET = os.environ["CLOUD_STORAGE_BUCKET"] -dataflow = build('dataflow', 'v1b3') +dataflow = build("dataflow", "v1b3") # Create a fake "app" for generating test request contexts. @@ -53,8 +53,9 @@ def app(): @pytest.fixture(scope="function") def dataflow_job_name(request): label = request.param - job_name = datetime.now().strftime('{}-%Y%m%d-%H%M%S-{}'.format( - label, uuid.uuid4().hex[:5])) + job_name = datetime.now().strftime( + "{}-%Y%m%d-%H%M%S-{}".format(label, uuid.uuid4().hex[:5]) + ) yield job_name @@ -69,17 +70,21 @@ def dataflow_job_name(request): # Takes in a Dataflow job name and returns its job ID def get_job_id_from_name(job_name): # list the 50 most recent Dataflow jobs - jobs_request = dataflow.projects().jobs().list( - projectId=PROJECT, - filter="ACTIVE", - pageSize=50 # only return the 50 most recent results - our job is likely to be in here. If the job is not found, first try increasing this number. For more info see:https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list + jobs_request = ( + dataflow.projects() + .jobs() + .list( + projectId=PROJECT, + filter="ACTIVE", + pageSize=50, # only return the 50 most recent results - our job is likely to be in here. If the job is not found, first try increasing this number. For more info see:https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list + ) ) response = jobs_request.execute() # search for the job in the list that has our name (names are unique) - for job in response['jobs']: - if job['name'] == job_name: - return job['id'] + for job in response["jobs"]: + if job["name"] == job_name: + return job["id"] # if we don't find a job, just return return @@ -92,32 +97,40 @@ def dataflow_jobs_cancel(job_name): if job_id: # Cancel the Dataflow job if it exists. If it doesn't, job_id will be equal to None. For more info, see: https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update - request = dataflow.projects().jobs().update( - projectId=PROJECT, - jobId=job_id, - body={'requestedState': 'JOB_STATE_CANCELLED'} + request = ( + dataflow.projects() + .jobs() + .update( + projectId=PROJECT, + jobId=job_id, + body={"requestedState": "JOB_STATE_CANCELLED"}, + ) ) request.execute() -@pytest.mark.parametrize('dataflow_job_name', [('test_run_template_empty')], indirect=True) +@pytest.mark.parametrize( + "dataflow_job_name", [("test_run_template_empty")], indirect=True +) def test_run_template_python_empty_args(app, dataflow_job_name): project = PROJECT - template = 'gs://dataflow-templates/latest/Word_Count' + template = "gs://dataflow-templates/latest/Word_Count" with pytest.raises(HttpError): main.run(project, dataflow_job_name, template) -@pytest.mark.parametrize('dataflow_job_name', [('test_run_template_python')], indirect=True) +@pytest.mark.parametrize( + "dataflow_job_name", [("test_run_template_python")], indirect=True +) def test_run_template_python(app, dataflow_job_name): project = PROJECT - template = 'gs://dataflow-templates/latest/Word_Count' + template = "gs://dataflow-templates/latest/Word_Count" parameters = { - 'inputFile': 'gs://apache-beam-samples/shakespeare/kinglear.txt', - 'output': 'gs://{}/dataflow/wordcount/outputs'.format(BUCKET), + "inputFile": "gs://apache-beam-samples/shakespeare/kinglear.txt", + "output": "gs://{}/dataflow/wordcount/outputs".format(BUCKET), } res = main.run(project, dataflow_job_name, template, parameters) - assert 'test_run_template_python' in res['job']['name'] + assert "test_run_template_python" in res["job"]["name"] def test_run_template_http_empty_args(app): @@ -126,46 +139,52 @@ def test_run_template_http_empty_args(app): main.run_template(flask.request) -@pytest.mark.parametrize('dataflow_job_name', [('test_run_template_url')], indirect=True) +@pytest.mark.parametrize( + "dataflow_job_name", [("test_run_template_url")], indirect=True +) def test_run_template_http_url(app, dataflow_job_name): args = { - 'project': PROJECT, - 'job': dataflow_job_name, - 'template': 'gs://dataflow-templates/latest/Word_Count', - 'inputFile': 'gs://apache-beam-samples/shakespeare/kinglear.txt', - 'output': 'gs://{}/dataflow/wordcount/outputs'.format(BUCKET), + "project": PROJECT, + "job": dataflow_job_name, + "template": "gs://dataflow-templates/latest/Word_Count", + "inputFile": "gs://apache-beam-samples/shakespeare/kinglear.txt", + "output": "gs://{}/dataflow/wordcount/outputs".format(BUCKET), } - with app.test_request_context('/?' + url_encode(args)): + with app.test_request_context("/?" + url_encode(args)): res = main.run_template(flask.request) data = json.loads(res) - assert 'test_run_template_url' in data['job']['name'] + assert "test_run_template_url" in data["job"]["name"] -@pytest.mark.parametrize('dataflow_job_name', [('test_run_template_data')], indirect=True) +@pytest.mark.parametrize( + "dataflow_job_name", [("test_run_template_data")], indirect=True +) def test_run_template_http_data(app, dataflow_job_name): args = { - 'project': PROJECT, - 'job': dataflow_job_name, - 'template': 'gs://dataflow-templates/latest/Word_Count', - 'inputFile': 'gs://apache-beam-samples/shakespeare/kinglear.txt', - 'output': 'gs://{}/dataflow/wordcount/outputs'.format(BUCKET), + "project": PROJECT, + "job": dataflow_job_name, + "template": "gs://dataflow-templates/latest/Word_Count", + "inputFile": "gs://apache-beam-samples/shakespeare/kinglear.txt", + "output": "gs://{}/dataflow/wordcount/outputs".format(BUCKET), } with app.test_request_context(data=args): res = main.run_template(flask.request) data = json.loads(res) - assert 'test_run_template_data' in data['job']['name'] + assert "test_run_template_data" in data["job"]["name"] -@pytest.mark.parametrize('dataflow_job_name', [('test_run_template_json')], indirect=True) +@pytest.mark.parametrize( + "dataflow_job_name", [("test_run_template_json")], indirect=True +) def test_run_template_http_json(app, dataflow_job_name): args = { - 'project': PROJECT, - 'job': dataflow_job_name, - 'template': 'gs://dataflow-templates/latest/Word_Count', - 'inputFile': 'gs://apache-beam-samples/shakespeare/kinglear.txt', - 'output': 'gs://{}/dataflow/wordcount/outputs'.format(BUCKET), + "project": PROJECT, + "job": dataflow_job_name, + "template": "gs://dataflow-templates/latest/Word_Count", + "inputFile": "gs://apache-beam-samples/shakespeare/kinglear.txt", + "output": "gs://{}/dataflow/wordcount/outputs".format(BUCKET), } with app.test_request_context(json=args): res = main.run_template(flask.request) data = json.loads(res) - assert 'test_run_template_json' in data['job']['name'] + assert "test_run_template_json" in data["job"]["name"] From e31334c5a517b384e1477f2ccc1cc57247546ec3 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 27 Apr 2021 14:03:21 -0700 Subject: [PATCH 09/57] fix import order --- dataflow/flex-templates/streaming_beam/e2e_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index b366ff90b40..ebbb7f9a568 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -13,9 +13,8 @@ import json import time -import pytest - import conftest as utils +import pytest NAME = "dataflow-flex-templates-streaming-beam" From b157d5443d34fd9903e3612ee9ee7944ce9b9616 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 27 Apr 2021 14:07:26 -0700 Subject: [PATCH 10/57] add google-cloud-storage --- dataflow/flex-templates/streaming_beam/requirements-test.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt index 16def8b7d52..0cee7f38e5a 100644 --- a/dataflow/flex-templates/streaming_beam/requirements-test.txt +++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt @@ -1,4 +1,5 @@ backoff==1.10.0 google-api-python-client==2.1.0 +google-cloud-storage==1.38.0 pytest-xdist==2.2.1 -pytest==6.2.1 +pytest==6.2.1 \ No newline at end of file From 48232b60acb9de7ee3647d4e4810a1e754caeb12 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 27 Apr 2021 14:36:44 -0700 Subject: [PATCH 11/57] make external library imports local --- dataflow/conftest.py | 80 +++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 8bbb4d59f31..2de0a6e825a 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -16,18 +16,9 @@ import multiprocessing as mp import os import subprocess -from typing import Callable, Dict, Optional +from typing import Any, Callable, Dict, Iterable, Optional import uuid -import backoff -from google.cloud import bigquery -from google.cloud import pubsub -from google.cloud import storage -from google.cloud.bigquery.table import RowIterator -from googleapiclient.discovery import build -from googleapiclient.errors import HttpError - -dataflow = build("dataflow", "v1b3") # Default options. UUID = uuid.uuid4().hex[0:6] @@ -39,6 +30,8 @@ def storage_bucket(bucket_name: str) -> str: + from google.cloud import storage + storage_client = storage.Client() bucket_unique_name = f"{bucket_name}-{UUID}" bucket = storage_client.create_bucket(bucket_unique_name) @@ -50,6 +43,8 @@ def storage_bucket(bucket_name: str) -> str: def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str: + from google.cloud import bigquery + bigquery_client = bigquery.Client() dataset = bigquery_client.create_dataset( bigquery.Dataset(f"{project}.{dataset_name}_{UUID}") @@ -61,13 +56,17 @@ def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str: bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True) -def bigquery_query(query: str) -> RowIterator: +def bigquery_query(query: str) -> Iterable[Dict[str, Any]]: + from google.cloud import bigquery + bigquery_client = bigquery.Client() - query_job = bigquery_client.query(query) - return query_job.result() + for row in bigquery_client.query(query): + yield dict(row) def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: + from google.cloud import pubsub + publisher_client = pubsub.PublisherClient() topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}") topic = publisher_client.create_topic(topic_path) @@ -88,6 +87,8 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: def pubsub_subscription( topic_path: str, subscription_name: str, project: str = PROJECT ) -> str: + from google.cloud import pubsub + subscriber = pubsub.SubscriberClient() subscription_path = subscriber.subscription_path( project, f"{subscription_name}-{UUID}" @@ -121,6 +122,8 @@ def pubsub_publisher( {"id": i, "content": f"message {i}"} ), ) -> bool: + from google.cloud import pubsub + def _infinite_publish_job() -> None: publisher_client = pubsub.PublisherClient() for i in itertools.count(): @@ -176,6 +179,10 @@ def container_image( def dataflow_job_id_from_job_name( job_name: str, project: str = PROJECT ) -> Optional[str]: + from googleapiclient.discovery import build + + dataflow = build("dataflow", "v1b3") + # Only return the 50 most recent results - our job is likely to be in here. # If the job is not found, first try increasing this number. # For more info see: @@ -198,26 +205,35 @@ def dataflow_job_id_from_job_name( return None -@backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None: - # To cancel a dataflow job, we need its ID, not its name - job_id = dataflow_job_id_from_job_name(project, job_name) - - if job_id is not None: - # Cancel the Dataflow job if it exists. - # If it doesn't, job_id will be equal to None. - # For more info, see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update - request = ( - dataflow.projects() - .jobs() - .update( - projectId=project, - jobId=job_id, - body={"requestedState": "JOB_STATE_CANCELLED"}, + import backoff + from googleapiclient.discovery import build + from googleapiclient.errors import HttpError + + dataflow = build("dataflow", "v1b3") + + @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) + def cancel(): + # To cancel a dataflow job, we need its ID, not its name + job_id = dataflow_job_id_from_job_name(project, job_name) + + if job_id is not None: + # Cancel the Dataflow job if it exists. + # If it doesn't, job_id will be equal to None. + # For more info, see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update + request = ( + dataflow.projects() + .jobs() + .update( + projectId=project, + jobId=job_id, + body={"requestedState": "JOB_STATE_CANCELLED"}, + ) ) - ) - request.execute() + request.execute() + + cancel() @staticmethod @@ -228,6 +244,8 @@ def dataflow_flex_template_build( project: str = PROJECT, template_file: str = "template.json", ) -> str: + from google.cloud import storage + subprocess.call( [ "gcloud", From b3921af78f9e42fd558f6ba95d5fd5dd5264f44a Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 27 Apr 2021 14:36:57 -0700 Subject: [PATCH 12/57] update checks --- dataflow/flex-templates/streaming_beam/e2e_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index ebbb7f9a568..60683a1bfcf 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -92,7 +92,7 @@ def test_run_template( # Check for output data in BigQuery. query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}" - rows = utils.bigquery_query(query) - assert rows.total_rows > 0 + rows = list(utils.bigquery_query(query)) + assert len(rows) > 0 for row in rows: assert "score" in row From 1e1b0c0c24c8c7a748df9cf578614d1e8ffa6d8d Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 27 Apr 2021 14:44:03 -0700 Subject: [PATCH 13/57] access through fixture --- dataflow/conftest.py | 505 +++++++++--------- .../flex-templates/streaming_beam/e2e_test.py | 20 +- 2 files changed, 273 insertions(+), 252 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 2de0a6e825a..a7e6f713419 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -10,6 +10,7 @@ # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from dataclasses import dataclass from datetime import time import itertools import json @@ -19,6 +20,7 @@ from typing import Any, Callable, Dict, Iterable, Optional import uuid +import pytest # Default options. UUID = uuid.uuid4().hex[0:6] @@ -29,267 +31,282 @@ RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds -def storage_bucket(bucket_name: str) -> str: - from google.cloud import storage +class Utils: + uuid: str = UUID + project: str = PROJECT + region: str = REGION + zone: str = ZONE - storage_client = storage.Client() - bucket_unique_name = f"{bucket_name}-{UUID}" - bucket = storage_client.create_bucket(bucket_unique_name) + @staticmethod + def storage_bucket(bucket_name: str) -> str: + from google.cloud import storage - print(f"storage_bucket: {repr(bucket_unique_name)}") - yield bucket_unique_name + storage_client = storage.Client() + bucket_unique_name = f"{bucket_name}-{UUID}" + bucket = storage_client.create_bucket(bucket_unique_name) - bucket.delete(force=True) + print(f"storage_bucket: {repr(bucket_unique_name)}") + yield bucket_unique_name + bucket.delete(force=True) -def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str: - from google.cloud import bigquery + @staticmethod + def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str: + from google.cloud import bigquery - bigquery_client = bigquery.Client() - dataset = bigquery_client.create_dataset( - bigquery.Dataset(f"{project}.{dataset_name}_{UUID}") - ) - - print(f"bigquery_dataset: {dataset.full_dataset_id}") - yield dataset.full_dataset_id - - bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True) - - -def bigquery_query(query: str) -> Iterable[Dict[str, Any]]: - from google.cloud import bigquery + bigquery_client = bigquery.Client() + dataset = bigquery_client.create_dataset( + bigquery.Dataset(f"{project}.{dataset_name}_{UUID}") + ) - bigquery_client = bigquery.Client() - for row in bigquery_client.query(query): - yield dict(row) + print(f"bigquery_dataset: {dataset.full_dataset_id}") + yield dataset.full_dataset_id + bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True) -def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: - from google.cloud import pubsub + @staticmethod + def bigquery_query(query: str) -> Iterable[Dict[str, Any]]: + from google.cloud import bigquery - publisher_client = pubsub.PublisherClient() - topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}") - topic = publisher_client.create_topic(topic_path) + bigquery_client = bigquery.Client() + for row in bigquery_client.query(query): + yield dict(row) - print(f"pubsub_topic: {repr(topic.name)}") - yield topic.name + @staticmethod + def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: + from google.cloud import pubsub - # Due to the pinned library dependencies in apache-beam, client - # library throws an error upon deletion. - # We use gcloud for a workaround. See also: - # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - subprocess.check_call( - ["gcloud", "pubsub", "--project", project, "topics", "delete", topic], - check=True, - ) + publisher_client = pubsub.PublisherClient() + topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}") + topic = publisher_client.create_topic(topic_path) + + print(f"pubsub_topic: {repr(topic.name)}") + yield topic.name + + # Due to the pinned library dependencies in apache-beam, client + # library throws an error upon deletion. + # We use gcloud for a workaround. See also: + # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 + subprocess.check_call( + ["gcloud", "pubsub", "--project", project, "topics", "delete", topic], + check=True, + ) + @staticmethod + def pubsub_subscription( + topic_path: str, + subscription_name: str, + project: str = PROJECT, + ) -> str: + from google.cloud import pubsub + + subscriber = pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path( + project, f"{subscription_name}-{UUID}" + ) + subscription = subscriber.create_subscription(subscription_path, topic_path) + + print(f"pubsub_subscription: {repr(subscription.name)}") + yield subscription.name + + # Due to the pinned library dependencies in apache-beam, client + # library throws an error upon deletion. + # We use gcloud for a workaround. See also: + # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 + subprocess.check_call( + [ + "gcloud", + "pubsub", + "--project", + project, + "subscriptions", + "delete", + subscription_name, + ], + check=True, + ) -def pubsub_subscription( - topic_path: str, subscription_name: str, project: str = PROJECT -) -> str: - from google.cloud import pubsub + @staticmethod + def pubsub_publisher( + topic_path: str, + new_msg: Callable[[int], str] = lambda i: json.dumps( + {"id": i, "content": f"message {i}"} + ), + ) -> bool: + from google.cloud import pubsub + + def _infinite_publish_job() -> None: + publisher_client = pubsub.PublisherClient() + for i in itertools.count(): + publisher_client.publish( + topic_path, new_msg(i).encode("utf-8") + ).result() + time.sleep(1) + + # Start a subprocess in the background to do the publishing. + p = mp.Process(target=_infinite_publish_job) + p.start() + + yield p.is_alive() + + # For cleanup, terminate the background process. + p.join(timeout=0) + p.terminate() + + @staticmethod + def container_image( + image_path: str, + project: str = PROJECT, + tag: str = "latest", + ) -> str: + image_name = f"gcr.io/{project}/{image_path}-{UUID}:{tag}" + subprocess.run(["gcloud", "auth", "configure-docker"], check=True) + subprocess.run( + [ + "gcloud", + "builds", + "submit", + f"--project={project}", + f"--tag={image_name}", + ".", + ], + check=True, + ) - subscriber = pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path( - project, f"{subscription_name}-{UUID}" - ) - subscription = subscriber.create_subscription(subscription_path, topic_path) + yield image_name + + subprocess.run( + [ + "gcloud", + "container", + "images", + "delete", + image_name, + f"--project={project}", + "--quiet", + ], + check=True, + ) - print(f"pubsub_subscription: {repr(subscription.name)}") - yield subscription.name + @staticmethod + def dataflow_job_id_from_job_name( + job_name: str, + project: str = PROJECT, + ) -> Optional[str]: + from googleapiclient.discovery import build + + dataflow = build("dataflow", "v1b3") + + # Only return the 50 most recent results - our job is likely to be in here. + # If the job is not found, first try increasing this number. + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list + jobs_request = ( + dataflow.projects() + .jobs() + .list( + projectId=project, + filter="ACTIVE", + pageSize=50, + ) + ) + response = jobs_request.execute() + + # Search for the job in the list that has our name (names are unique) + for job in response["jobs"]: + if job["name"] == job_name: + return job["id"] + return None + + @staticmethod + def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None: + import backoff + from googleapiclient.discovery import build + from googleapiclient.errors import HttpError + + dataflow = build("dataflow", "v1b3") + + @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) + def cancel(): + # To cancel a dataflow job, we need its ID, not its name + job_id = Utils.dataflow_job_id_from_job_name(project, job_name) + + if job_id is not None: + # Cancel the Dataflow job if it exists. + # If it doesn't, job_id will be equal to None. + # For more info, see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update + request = ( + dataflow.projects() + .jobs() + .update( + projectId=project, + jobId=job_id, + body={"requestedState": "JOB_STATE_CANCELLED"}, + ) + ) + request.execute() + + cancel() + + @staticmethod + def dataflow_flex_template_build( + bucket_name: str, + template_image: str, + metadata_file: str, + project: str = PROJECT, + template_file: str = "template.json", + ) -> str: + from google.cloud import storage + + subprocess.call( + [ + "gcloud", + "dataflow", + "flex-template", + "build", + f"gs://{bucket_name}/{template_file}", + f"--project={project}", + f"--image={template_image}", + "--sdk-language=PYTHON", + f"--metadata-file={metadata_file}", + ], + check=True, + ) - # Due to the pinned library dependencies in apache-beam, client - # library throws an error upon deletion. - # We use gcloud for a workaround. See also: - # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - subprocess.check_call( - [ - "gcloud", - "pubsub", - "--project", - project, - "subscriptions", - "delete", - subscription_name, - ], - check=True, - ) + yield f"gs://{bucket_name}/{template_file}" + + storage_client = storage.Client() + storage_client.bucket(bucket_name).blob(template_file).delete() + + def dataflow_flex_template_run( + job_name: str, + template_path: str, + bucket_name: str, + parameters: Dict[str, str] = {}, + project: str = PROJECT, + region: str = REGION, + ) -> str: + unique_job_name = f"{job_name}-{UUID}" + subprocess.call( + [ + "gcloud", + "dataflow", + "flex-template", + "run", + unique_job_name, + f"--template-file-gcs-location={template_path}", + f"--project={project}", + f"--region={region}", + f"--temp_location=gs://{bucket_name}/temp", + ] + + [f"--parameters={name}={value}" for name, value in parameters.items()], + check=True, + ) + yield unique_job_name -def pubsub_publisher( - topic_path: str, - new_msg: Callable[[int], str] = lambda i: json.dumps( - {"id": i, "content": f"message {i}"} - ), -) -> bool: - from google.cloud import pubsub - def _infinite_publish_job() -> None: - publisher_client = pubsub.PublisherClient() - for i in itertools.count(): - publisher_client.publish(topic_path, new_msg(i).encode("utf-8")).result() - time.sleep(1) - - # Start a subprocess in the background to do the publishing. - p = mp.Process(target=_infinite_publish_job) - p.start() - - yield p.is_alive() - - # For cleanup, terminate the background process. - p.join(timeout=0) - p.terminate() - - -def container_image( - image_path: str, - project: str = PROJECT, - tag: str = "latest", -) -> str: - image_name = f"gcr.io/{project}/{image_path}-{UUID}:{tag}" - subprocess.run(["gcloud", "auth", "configure-docker"], check=True) - subprocess.run( - [ - "gcloud", - "builds", - "submit", - f"--project={project}", - f"--tag={image_name}", - ".", - ], - check=True, - ) - - yield image_name - - subprocess.run( - [ - "gcloud", - "container", - "images", - "delete", - image_name, - f"--project={project}", - "--quiet", - ], - check=True, - ) - - -def dataflow_job_id_from_job_name( - job_name: str, project: str = PROJECT -) -> Optional[str]: - from googleapiclient.discovery import build - - dataflow = build("dataflow", "v1b3") - - # Only return the 50 most recent results - our job is likely to be in here. - # If the job is not found, first try increasing this number. - # For more info see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list - jobs_request = ( - dataflow.projects() - .jobs() - .list( - projectId=project, - filter="ACTIVE", - pageSize=50, - ) - ) - response = jobs_request.execute() - - # Search for the job in the list that has our name (names are unique) - for job in response["jobs"]: - if job["name"] == job_name: - return job["id"] - return None - - -def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None: - import backoff - from googleapiclient.discovery import build - from googleapiclient.errors import HttpError - - dataflow = build("dataflow", "v1b3") - - @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) - def cancel(): - # To cancel a dataflow job, we need its ID, not its name - job_id = dataflow_job_id_from_job_name(project, job_name) - - if job_id is not None: - # Cancel the Dataflow job if it exists. - # If it doesn't, job_id will be equal to None. - # For more info, see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update - request = ( - dataflow.projects() - .jobs() - .update( - projectId=project, - jobId=job_id, - body={"requestedState": "JOB_STATE_CANCELLED"}, - ) - ) - request.execute() - - cancel() - - -@staticmethod -def dataflow_flex_template_build( - bucket_name: str, - template_image: str, - metadata_file: str, - project: str = PROJECT, - template_file: str = "template.json", -) -> str: - from google.cloud import storage - - subprocess.call( - [ - "gcloud", - "dataflow", - "flex-template", - "build", - f"gs://{bucket_name}/{template_file}", - f"--project={project}", - f"--image={template_image}", - "--sdk-language=PYTHON", - f"--metadata-file={metadata_file}", - ], - check=True, - ) - - yield f"gs://{bucket_name}/{template_file}" - - storage_client = storage.Client() - storage_client.bucket(bucket_name).blob(template_file).delete() - - -def dataflow_flex_template_run( - job_name: str, - template_path: str, - bucket_name: str, - parameters: Dict[str, str] = {}, - project: str = PROJECT, - region: str = REGION, -) -> str: - unique_job_name = f"{job_name}-{UUID}" - subprocess.call( - [ - "gcloud", - "dataflow", - "flex-template", - "run", - unique_job_name, - f"--template-file-gcs-location={template_path}", - f"--project={project}", - f"--region={region}", - f"--temp_location=gs://{bucket_name}/temp", - ] - + [f"--parameters={name}={value}" for name, value in parameters.items()], - check=True, - ) - - yield unique_job_name +@pytest.fixture +def utils() -> Utils: + return Utils() diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 60683a1bfcf..bbc1cfd10f3 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -13,34 +13,37 @@ import json import time -import conftest as utils +try: + from conftest import Utils +except ModuleNotFoundError: + pass import pytest NAME = "dataflow-flex-templates-streaming-beam" @pytest.fixture(scope="session") -def bucket_name() -> str: +def bucket_name(utils: Utils) -> str: return utils.storage_bucket(NAME) @pytest.fixture(scope="session") -def pubsub_topic() -> str: +def pubsub_topic(utils: Utils) -> str: return utils.pubsub_topic(NAME) @pytest.fixture(scope="session") -def pubsub_subscription(pubsub_topic: str) -> str: +def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str: return utils.pubsub_subscription(pubsub_topic, NAME) @pytest.fixture(scope="session") -def bigquery_dataset() -> str: +def bigquery_dataset(utils: Utils) -> str: return utils.bigquery_dataset(NAME.replace("-", "_")) @pytest.fixture(scope="session") -def pubsub_publisher(pubsub_topic: str) -> bool: +def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool: return utils.pubsub_publisher( pubsub_topic, new_msg=lambda i: json.dumps( @@ -53,12 +56,12 @@ def pubsub_publisher(pubsub_topic: str) -> bool: @pytest.fixture(scope="session") -def flex_template_image() -> str: +def flex_template_image(utils: Utils) -> str: return utils.container_image(NAME) @pytest.fixture(scope="session") -def flex_template_path(bucket_name: str, flex_template_image: str) -> str: +def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str: return utils.dataflow_flex_template_build( bucket_name=bucket_name, template_image=flex_template_image, @@ -67,6 +70,7 @@ def flex_template_path(bucket_name: str, flex_template_image: str) -> str: def test_run_template( + utils: Utils, bucket_name: str, pubsub_publisher: str, pubsub_subscription: str, From d1faf4abbc4c223a47d840c3b5e2f2e8a6c53247 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 29 Apr 2021 10:36:00 -0700 Subject: [PATCH 14/57] add notes --- dataflow/flex-templates/streaming_beam/e2e_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index bbc1cfd10f3..f408bb27394 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -16,7 +16,11 @@ try: from conftest import Utils except ModuleNotFoundError: - pass + from typing import Any + + # `conftest` cannot be imported when running in `nox`, but we still + # try to import it for the autocomplete when writing the tests. + Utils = Any import pytest NAME = "dataflow-flex-templates-streaming-beam" From c7ff583be5dae6d456e0a556583bb0cd256ddb45 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 29 Apr 2021 13:41:13 -0700 Subject: [PATCH 15/57] make session scoped --- dataflow/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index a7e6f713419..5a83d7aacab 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -307,6 +307,6 @@ def dataflow_flex_template_run( yield unique_job_name -@pytest.fixture +@pytest.fixture(scope="session") def utils() -> Utils: return Utils() From aefd54ccbc652b5efd1baf9b3ced638a9c1032d2 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 30 Apr 2021 10:41:02 -0700 Subject: [PATCH 16/57] flex_template_run returns instead of yield --- dataflow/conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 5a83d7aacab..b3bc31e0776 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -279,6 +279,7 @@ def dataflow_flex_template_build( storage_client = storage.Client() storage_client.bucket(bucket_name).blob(template_file).delete() + @staticmethod def dataflow_flex_template_run( job_name: str, template_path: str, @@ -304,7 +305,7 @@ def dataflow_flex_template_run( check=True, ) - yield unique_job_name + return unique_job_name @pytest.fixture(scope="session") From 1964725614632cfed631b4f8f63c4c1d6a73b916 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 30 Apr 2021 10:43:04 -0700 Subject: [PATCH 17/57] document try imports --- dataflow/flex-templates/streaming_beam/e2e_test.py | 4 ++-- dataflow/run_template/main_test.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index f408bb27394..afa2cb77ead 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -13,13 +13,13 @@ import json import time +# `conftest` cannot be imported when running in `nox`, but we still +# try to import it for the autocomplete when writing the tests. try: from conftest import Utils except ModuleNotFoundError: from typing import Any - # `conftest` cannot be imported when running in `nox`, but we still - # try to import it for the autocomplete when writing the tests. Utils = Any import pytest diff --git a/dataflow/run_template/main_test.py b/dataflow/run_template/main_test.py index 1c20aeed1b4..d5d7eaf8195 100644 --- a/dataflow/run_template/main_test.py +++ b/dataflow/run_template/main_test.py @@ -31,7 +31,12 @@ from werkzeug.urls import url_encode -from . import main +# Relative imports cannot be found when running in `nox`, but we still +# try to import it for the autocomplete when writing the tests. +try: + from . import main +except ModuleNotFoundError: + import main RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds From dd10c9aad5a1f6444cb20d7cd8560d1d99506c69 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 30 Apr 2021 12:29:43 -0700 Subject: [PATCH 18/57] make dataclass --- dataflow/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index b3bc31e0776..63fb7654170 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -31,6 +31,7 @@ RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds +@dataclass class Utils: uuid: str = UUID project: str = PROJECT From 7b79e01630b9d5dba479caf777ec68d0e6d558ac Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 30 Apr 2021 12:30:00 -0700 Subject: [PATCH 19/57] fix exception type --- dataflow/run_template/main_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/run_template/main_test.py b/dataflow/run_template/main_test.py index d5d7eaf8195..0887f1b7817 100644 --- a/dataflow/run_template/main_test.py +++ b/dataflow/run_template/main_test.py @@ -35,7 +35,7 @@ # try to import it for the autocomplete when writing the tests. try: from . import main -except ModuleNotFoundError: +except ImportError: import main From 2f7ca670f59639f5fabe7853a1728a2237eb85dc Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 30 Apr 2021 12:32:15 -0700 Subject: [PATCH 20/57] make subprocess.run --- dataflow/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 63fb7654170..6abb859f035 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -260,7 +260,7 @@ def dataflow_flex_template_build( ) -> str: from google.cloud import storage - subprocess.call( + subprocess.run( [ "gcloud", "dataflow", @@ -290,7 +290,7 @@ def dataflow_flex_template_run( region: str = REGION, ) -> str: unique_job_name = f"{job_name}-{UUID}" - subprocess.call( + subprocess.run( [ "gcloud", "dataflow", From aaa25ea7e99c961facdb6c357e8da4655308afc2 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 30 Apr 2021 15:04:46 -0700 Subject: [PATCH 21/57] use yield from --- dataflow/flex-templates/streaming_beam/e2e_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index afa2cb77ead..2ca3667c321 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -28,27 +28,27 @@ @pytest.fixture(scope="session") def bucket_name(utils: Utils) -> str: - return utils.storage_bucket(NAME) + yield from utils.storage_bucket(NAME) @pytest.fixture(scope="session") def pubsub_topic(utils: Utils) -> str: - return utils.pubsub_topic(NAME) + yield from utils.pubsub_topic(NAME) @pytest.fixture(scope="session") def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str: - return utils.pubsub_subscription(pubsub_topic, NAME) + yield from utils.pubsub_subscription(pubsub_topic, NAME) @pytest.fixture(scope="session") def bigquery_dataset(utils: Utils) -> str: - return utils.bigquery_dataset(NAME.replace("-", "_")) + yield from utils.bigquery_dataset(NAME.replace("-", "_")) @pytest.fixture(scope="session") def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool: - return utils.pubsub_publisher( + yield from utils.pubsub_publisher( pubsub_topic, new_msg=lambda i: json.dumps( { @@ -61,12 +61,12 @@ def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool: @pytest.fixture(scope="session") def flex_template_image(utils: Utils) -> str: - return utils.container_image(NAME) + yield from utils.container_image(NAME) @pytest.fixture(scope="session") def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str: - return utils.dataflow_flex_template_build( + yield from utils.dataflow_flex_template_build( bucket_name=bucket_name, template_image=flex_template_image, metadata_file="metadata.json", From a33d9a339ad227d8202d67ca9c900c65ab69b06f Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 30 Apr 2021 15:52:48 -0700 Subject: [PATCH 22/57] use subprocess.run --- dataflow/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 6abb859f035..8806e0f9db0 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -88,7 +88,7 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: # library throws an error upon deletion. # We use gcloud for a workaround. See also: # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - subprocess.check_call( + subprocess.run( ["gcloud", "pubsub", "--project", project, "topics", "delete", topic], check=True, ) @@ -114,7 +114,7 @@ def pubsub_subscription( # library throws an error upon deletion. # We use gcloud for a workaround. See also: # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - subprocess.check_call( + subprocess.run( [ "gcloud", "pubsub", From 49fa5275be454ba3e2424fa17151a9328fcbbd9e Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 3 May 2021 16:22:28 -0700 Subject: [PATCH 23/57] add more logging and fix tests --- dataflow/conftest.py | 98 ++++++++++++------- .../flex-templates/streaming_beam/e2e_test.py | 4 +- 2 files changed, 67 insertions(+), 35 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 8806e0f9db0..ae1aa4b0773 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -11,12 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. from dataclasses import dataclass -from datetime import time import itertools import json import multiprocessing as mp import os import subprocess +import time from typing import Any, Callable, Dict, Iterable, Optional import uuid @@ -28,6 +28,7 @@ REGION = "us-west1" ZONE = "us-west1-b" +CONSOLE_URL = "https://console.cloud.google.com" RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds @@ -46,7 +47,8 @@ def storage_bucket(bucket_name: str) -> str: bucket_unique_name = f"{bucket_name}-{UUID}" bucket = storage_client.create_bucket(bucket_unique_name) - print(f"storage_bucket: {repr(bucket_unique_name)}") + print(f"storage_bucket: {bucket_unique_name}") + print(f"\t{CONSOLE_URL}/storage/browser/{bucket_unique_name}&project={PROJECT}") yield bucket_unique_name bucket.delete(force=True) @@ -81,7 +83,10 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}") topic = publisher_client.create_topic(topic_path) - print(f"pubsub_topic: {repr(topic.name)}") + print(f"pubsub_topic: {topic.name}") + print( + f"\t{CONSOLE_URL}/cloudpubsub/topic/detail/{topic.name}&project={project}" + ) yield topic.name # Due to the pinned library dependencies in apache-beam, client @@ -107,7 +112,10 @@ def pubsub_subscription( ) subscription = subscriber.create_subscription(subscription_path, topic_path) - print(f"pubsub_subscription: {repr(subscription.name)}") + print(f"pubsub_subscription: {subscription.name}") + print( + f"\t{CONSOLE_URL}/cloudpubsub/subscription/detail/{subscription.name}&project={project}" + ) yield subscription.name # Due to the pinned library dependencies in apache-beam, client @@ -122,7 +130,7 @@ def pubsub_subscription( project, "subscriptions", "delete", - subscription_name, + subscription.name, ], check=True, ) @@ -139,18 +147,20 @@ def pubsub_publisher( def _infinite_publish_job() -> None: publisher_client = pubsub.PublisherClient() for i in itertools.count(): - publisher_client.publish( - topic_path, new_msg(i).encode("utf-8") - ).result() + msg = new_msg(i) + print(f">> publish[{i}]: {repr(msg)}") + publisher_client.publish(topic_path, msg.encode("utf-8")).result() time.sleep(1) # Start a subprocess in the background to do the publishing. + print(f"Starting publisher on {topic_path}") p = mp.Process(target=_infinite_publish_job) p.start() yield p.is_alive() # For cleanup, terminate the background process. + print("Stopping publisher") p.join(timeout=0) p.terminate() @@ -174,6 +184,10 @@ def container_image( check=True, ) + print(f"container_image: {image_name}") + print( + f"\t{CONSOLE_URL}/gcr/images/{project}/GLOBAL/{image_path}?project={project}" + ) yield image_name subprocess.run( @@ -199,7 +213,7 @@ def dataflow_job_id_from_job_name( dataflow = build("dataflow", "v1b3") # Only return the 50 most recent results - our job is likely to be in here. - # If the job is not found, first try increasing this number. + # If the job is not found, first try increasing this number.[]''job_id # For more info see: # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list jobs_request = ( @@ -220,7 +234,27 @@ def dataflow_job_id_from_job_name( return None @staticmethod - def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None: + def dataflow_jobs_cancel_by_job_id(job_id: str, project: str = PROJECT) -> None: + from googleapiclient.discovery import build + + dataflow = build("dataflow", "v1b3") + + # For more info, see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update + print(f"Canceling Dataflow job ID: {job_id}") + request = ( + dataflow.projects() + .jobs() + .update( + projectId=project, + jobId=job_id, + body={"requestedState": "JOB_STATE_CANCELLED"}, + ) + ) + request.execute() + + @staticmethod + def dataflow_jobs_cancel_by_job_name(job_name: str, project: str = PROJECT) -> None: import backoff from googleapiclient.discovery import build from googleapiclient.errors import HttpError @@ -229,24 +263,11 @@ def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None: @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) def cancel(): - # To cancel a dataflow job, we need its ID, not its name + # To cancel a dataflow job, we need its ID, not its name. + # If it doesn't, job_id will be equal to None. job_id = Utils.dataflow_job_id_from_job_name(project, job_name) - if job_id is not None: - # Cancel the Dataflow job if it exists. - # If it doesn't, job_id will be equal to None. - # For more info, see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update - request = ( - dataflow.projects() - .jobs() - .update( - projectId=project, - jobId=job_id, - body={"requestedState": "JOB_STATE_CANCELLED"}, - ) - ) - request.execute() + Utils.dataflow_jobs_cancel_by_job_id(job_id) cancel() @@ -260,13 +281,14 @@ def dataflow_flex_template_build( ) -> str: from google.cloud import storage - subprocess.run( + template_gcs_path = f"gs://{bucket_name}/{template_file}" + p = subprocess.run( [ "gcloud", "dataflow", "flex-template", "build", - f"gs://{bucket_name}/{template_file}", + template_gcs_path, f"--project={project}", f"--image={template_image}", "--sdk-language=PYTHON", @@ -275,7 +297,8 @@ def dataflow_flex_template_build( check=True, ) - yield f"gs://{bucket_name}/{template_file}" + print(f"dataflow_flex_template_build: {template_gcs_path}") + yield template_gcs_path storage_client = storage.Client() storage_client.bucket(bucket_name).blob(template_file).delete() @@ -290,7 +313,7 @@ def dataflow_flex_template_run( region: str = REGION, ) -> str: unique_job_name = f"{job_name}-{UUID}" - subprocess.run( + stdout = subprocess.run( [ "gcloud", "dataflow", @@ -304,9 +327,18 @@ def dataflow_flex_template_run( ] + [f"--parameters={name}={value}" for name, value in parameters.items()], check=True, - ) - - return unique_job_name + capture_output=True, + ).stdout.decode("utf-8") + + print(f"Launched Dataflow template job: {unique_job_name}") + print(stdout) + + try: + job_id = json.loads(stdout)["job_id"] + print(f"\t{CONSOLE_URL}/dataflow/jobs/{region}/{job_id}&project={project}") + except: + print(f"\t{CONSOLE_URL}/dataflow/jobs&project={project}") + return job_id @pytest.fixture(scope="session") diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 2ca3667c321..c94ac7b2741 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -83,7 +83,7 @@ def test_run_template( ) -> None: bigquery_table = "output_table" - job_name = utils.dataflow_flex_template_run( + job_id = utils.dataflow_flex_template_run( job_name=NAME, template_path=flex_template_path, bucket_name=bucket_name, @@ -96,7 +96,7 @@ def test_run_template( # Since this is a streaming job, it will never finish running. # Wait for 10 minutes, and then cancel the job. time.sleep(10 * 60) - utils.dataflow_jobs_cancel(job_name) + utils.dataflow_jobs_cancel_by_job_id(job_id) # Check for output data in BigQuery. query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}" From 63abad2ae3e366a19e315db73da701a4b0b5e61b Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 3 May 2021 17:05:31 -0700 Subject: [PATCH 24/57] more fixes --- dataflow/conftest.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index ae1aa4b0773..b156da27555 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -94,7 +94,7 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: # We use gcloud for a workaround. See also: # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 subprocess.run( - ["gcloud", "pubsub", "--project", project, "topics", "delete", topic], + ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name], check=True, ) @@ -141,6 +141,7 @@ def pubsub_publisher( new_msg: Callable[[int], str] = lambda i: json.dumps( {"id": i, "content": f"message {i}"} ), + sleep_sec: int = 1, ) -> bool: from google.cloud import pubsub @@ -148,9 +149,8 @@ def _infinite_publish_job() -> None: publisher_client = pubsub.PublisherClient() for i in itertools.count(): msg = new_msg(i) - print(f">> publish[{i}]: {repr(msg)}") publisher_client.publish(topic_path, msg.encode("utf-8")).result() - time.sleep(1) + time.sleep(sleep_sec) # Start a subprocess in the background to do the publishing. print(f"Starting publisher on {topic_path}") @@ -279,10 +279,8 @@ def dataflow_flex_template_build( project: str = PROJECT, template_file: str = "template.json", ) -> str: - from google.cloud import storage - template_gcs_path = f"gs://{bucket_name}/{template_file}" - p = subprocess.run( + subprocess.run( [ "gcloud", "dataflow", @@ -299,9 +297,7 @@ def dataflow_flex_template_build( print(f"dataflow_flex_template_build: {template_gcs_path}") yield template_gcs_path - - storage_client = storage.Client() - storage_client.bucket(bucket_name).blob(template_file).delete() + # The template file gets deleted when we delete the bucket. @staticmethod def dataflow_flex_template_run( From f0314e506716643279749ee20f27c5bb9500061d Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 4 May 2021 11:25:58 -0700 Subject: [PATCH 25/57] add more logging --- dataflow/conftest.py | 138 +++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 71 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index b156da27555..b30af00ac57 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -93,10 +93,9 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: # library throws an error upon deletion. # We use gcloud for a workaround. See also: # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - subprocess.run( - ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name], - check=True, - ) + cmd = ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name] + print(cmd) + subprocess.run(cmd, check=True) @staticmethod def pubsub_subscription( @@ -122,18 +121,17 @@ def pubsub_subscription( # library throws an error upon deletion. # We use gcloud for a workaround. See also: # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 - subprocess.run( - [ - "gcloud", - "pubsub", - "--project", - project, - "subscriptions", - "delete", - subscription.name, - ], - check=True, - ) + cmd = [ + "gcloud", + "pubsub", + "--project", + project, + "subscriptions", + "delete", + subscription.name, + ] + print(cmd) + subprocess.run(cmd, check=True) @staticmethod def pubsub_publisher( @@ -171,18 +169,19 @@ def container_image( tag: str = "latest", ) -> str: image_name = f"gcr.io/{project}/{image_path}-{UUID}:{tag}" - subprocess.run(["gcloud", "auth", "configure-docker"], check=True) - subprocess.run( - [ - "gcloud", - "builds", - "submit", - f"--project={project}", - f"--tag={image_name}", - ".", - ], - check=True, - ) + cmd = ["gcloud", "auth", "configure-docker"] + print(cmd) + subprocess.run(cmd, check=True) + cmd = [ + "gcloud", + "builds", + "submit", + f"--project={project}", + f"--tag={image_name}", + ".", + ] + print(cmd) + subprocess.run(cmd, check=True) print(f"container_image: {image_name}") print( @@ -190,18 +189,17 @@ def container_image( ) yield image_name - subprocess.run( - [ - "gcloud", - "container", - "images", - "delete", - image_name, - f"--project={project}", - "--quiet", - ], - check=True, - ) + cmd = [ + "gcloud", + "container", + "images", + "delete", + image_name, + f"--project={project}", + "--quiet", + ] + print(cmd) + subprocess.run(cmd, check=True) @staticmethod def dataflow_job_id_from_job_name( @@ -280,20 +278,19 @@ def dataflow_flex_template_build( template_file: str = "template.json", ) -> str: template_gcs_path = f"gs://{bucket_name}/{template_file}" - subprocess.run( - [ - "gcloud", - "dataflow", - "flex-template", - "build", - template_gcs_path, - f"--project={project}", - f"--image={template_image}", - "--sdk-language=PYTHON", - f"--metadata-file={metadata_file}", - ], - check=True, - ) + cmd = [ + "gcloud", + "dataflow", + "flex-template", + "build", + template_gcs_path, + f"--project={project}", + f"--image={template_image}", + "--sdk-language=PYTHON", + f"--metadata-file={metadata_file}", + ] + print(cmd) + subprocess.run(cmd, check=True) print(f"dataflow_flex_template_build: {template_gcs_path}") yield template_gcs_path @@ -309,25 +306,24 @@ def dataflow_flex_template_run( region: str = REGION, ) -> str: unique_job_name = f"{job_name}-{UUID}" - stdout = subprocess.run( - [ - "gcloud", - "dataflow", - "flex-template", - "run", - unique_job_name, - f"--template-file-gcs-location={template_path}", - f"--project={project}", - f"--region={region}", - f"--temp_location=gs://{bucket_name}/temp", - ] - + [f"--parameters={name}={value}" for name, value in parameters.items()], - check=True, - capture_output=True, - ).stdout.decode("utf-8") + cmd = [ + "gcloud", + "dataflow", + "flex-template", + "run", + unique_job_name, + f"--template-file-gcs-location={template_path}", + f"--project={project}", + f"--region={region}", + f"--temp_location=gs://{bucket_name}/temp", + ] + [f"--parameters={name}={value}" for name, value in parameters.items()] + print(cmd) + stdout = subprocess.run(cmd, check=True, capture_output=True).stdout.decode( + "utf-8" + ) + print(stdout) print(f"Launched Dataflow template job: {unique_job_name}") - print(stdout) try: job_id = json.loads(stdout)["job_id"] From 56359c5f817c110d3f49ca0057d9b034d3157d42 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 4 May 2021 11:28:10 -0700 Subject: [PATCH 26/57] print gcloud version --- dataflow/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index b30af00ac57..e35df34b3ae 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -335,4 +335,5 @@ def dataflow_flex_template_run( @pytest.fixture(scope="session") def utils() -> Utils: + subprocess.run(["gcloud", "--version"]) return Utils() From 9eb72ab7f27c3bdc59c4954cc85b1ad8243e7515 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 4 May 2021 11:44:24 -0700 Subject: [PATCH 27/57] add gcloudignore --- dataflow/flex-templates/streaming_beam/.gcloudignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 dataflow/flex-templates/streaming_beam/.gcloudignore diff --git a/dataflow/flex-templates/streaming_beam/.gcloudignore b/dataflow/flex-templates/streaming_beam/.gcloudignore new file mode 100644 index 00000000000..e6babcd3e1a --- /dev/null +++ b/dataflow/flex-templates/streaming_beam/.gcloudignore @@ -0,0 +1,3 @@ +* +!*.py +!requirements.txt \ No newline at end of file From 295167d87a416016885b8d9b31cc40e25a9ad87a Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 4 May 2021 11:51:05 -0700 Subject: [PATCH 28/57] update gcloud and optimize docker build --- dataflow/conftest.py | 5 ++++- dataflow/flex-templates/streaming_beam/.dockerignore | 4 ++++ dataflow/flex-templates/streaming_beam/.gcloudignore | 1 + dataflow/flex-templates/streaming_beam/e2e_test.py | 2 +- 4 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 dataflow/flex-templates/streaming_beam/.dockerignore diff --git a/dataflow/conftest.py b/dataflow/conftest.py index e35df34b3ae..6cebfe01bf9 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -335,5 +335,8 @@ def dataflow_flex_template_run( @pytest.fixture(scope="session") def utils() -> Utils: - subprocess.run(["gcloud", "--version"]) + # Some commands like `gcloud dataflow flex-template` are only available + # in the latest gcloud versions. + subprocess.run(["gcloud", "components", "update", "--quiet"], check=True) + subprocess.run(["gcloud", "version"]) return Utils() diff --git a/dataflow/flex-templates/streaming_beam/.dockerignore b/dataflow/flex-templates/streaming_beam/.dockerignore new file mode 100644 index 00000000000..bd849df68f6 --- /dev/null +++ b/dataflow/flex-templates/streaming_beam/.dockerignore @@ -0,0 +1,4 @@ +# Ignore everything except for Python files and the requirements file. +* +!*.py +!requirements.txt \ No newline at end of file diff --git a/dataflow/flex-templates/streaming_beam/.gcloudignore b/dataflow/flex-templates/streaming_beam/.gcloudignore index e6babcd3e1a..bd849df68f6 100644 --- a/dataflow/flex-templates/streaming_beam/.gcloudignore +++ b/dataflow/flex-templates/streaming_beam/.gcloudignore @@ -1,3 +1,4 @@ +# Ignore everything except for Python files and the requirements file. * !*.py !requirements.txt \ No newline at end of file diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index c94ac7b2741..0fa6d853dfb 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -73,7 +73,7 @@ def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) ) -def test_run_template( +def test_flex_template_run( utils: Utils, bucket_name: str, pubsub_publisher: str, From 89809fe061b1495822010b4a68997bcbed31d533 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 4 May 2021 11:59:09 -0700 Subject: [PATCH 29/57] print gcloud version --- dataflow/conftest.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 6cebfe01bf9..51fceb59ad5 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -28,7 +28,6 @@ REGION = "us-west1" ZONE = "us-west1-b" -CONSOLE_URL = "https://console.cloud.google.com" RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds @@ -48,7 +47,6 @@ def storage_bucket(bucket_name: str) -> str: bucket = storage_client.create_bucket(bucket_unique_name) print(f"storage_bucket: {bucket_unique_name}") - print(f"\t{CONSOLE_URL}/storage/browser/{bucket_unique_name}&project={PROJECT}") yield bucket_unique_name bucket.delete(force=True) @@ -84,9 +82,6 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: topic = publisher_client.create_topic(topic_path) print(f"pubsub_topic: {topic.name}") - print( - f"\t{CONSOLE_URL}/cloudpubsub/topic/detail/{topic.name}&project={project}" - ) yield topic.name # Due to the pinned library dependencies in apache-beam, client @@ -112,9 +107,6 @@ def pubsub_subscription( subscription = subscriber.create_subscription(subscription_path, topic_path) print(f"pubsub_subscription: {subscription.name}") - print( - f"\t{CONSOLE_URL}/cloudpubsub/subscription/detail/{subscription.name}&project={project}" - ) yield subscription.name # Due to the pinned library dependencies in apache-beam, client @@ -184,9 +176,6 @@ def container_image( subprocess.run(cmd, check=True) print(f"container_image: {image_name}") - print( - f"\t{CONSOLE_URL}/gcr/images/{project}/GLOBAL/{image_path}?project={project}" - ) yield image_name cmd = [ @@ -322,21 +311,14 @@ def dataflow_flex_template_run( "utf-8" ) print(stdout) - - print(f"Launched Dataflow template job: {unique_job_name}") - - try: - job_id = json.loads(stdout)["job_id"] - print(f"\t{CONSOLE_URL}/dataflow/jobs/{region}/{job_id}&project={project}") - except: - print(f"\t{CONSOLE_URL}/dataflow/jobs&project={project}") - return job_id + print(f"Launched Dataflow Flex Template job: {unique_job_name}") + return json.loads(stdout)["job_id"] @pytest.fixture(scope="session") def utils() -> Utils: # Some commands like `gcloud dataflow flex-template` are only available # in the latest gcloud versions. - subprocess.run(["gcloud", "components", "update", "--quiet"], check=True) + # subprocess.run(["gcloud", "components", "update", "--quiet"], check=True) subprocess.run(["gcloud", "version"]) return Utils() From 7e5b53203b2cc7f2a0ddda3f7dacc66660637bc7 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 10 May 2021 09:19:14 -0700 Subject: [PATCH 30/57] remove outdated comments --- dataflow/conftest.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 51fceb59ad5..f06b168237c 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -317,8 +317,5 @@ def dataflow_flex_template_run( @pytest.fixture(scope="session") def utils() -> Utils: - # Some commands like `gcloud dataflow flex-template` are only available - # in the latest gcloud versions. - # subprocess.run(["gcloud", "components", "update", "--quiet"], check=True) subprocess.run(["gcloud", "version"]) return Utils() From e5d5641d8eaa62930782168521bb7bca0f103fc0 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 10 May 2021 09:22:13 -0700 Subject: [PATCH 31/57] udpated test requirements --- dataflow/flex-templates/streaming_beam/requirements-test.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt index 6943b5160fa..01511732bed 100644 --- a/dataflow/flex-templates/streaming_beam/requirements-test.txt +++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt @@ -1,4 +1,3 @@ -backoff==1.10.0 google-api-python-client==2.1.0 google-cloud-storage==1.38.0 pytest-xdist==2.2.1 From 7ff556263a18eae92d5f49d5433d63a6a9244180 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 10 May 2021 09:39:29 -0700 Subject: [PATCH 32/57] include Dockerfile --- dataflow/flex-templates/streaming_beam/.gcloudignore | 1 + 1 file changed, 1 insertion(+) diff --git a/dataflow/flex-templates/streaming_beam/.gcloudignore b/dataflow/flex-templates/streaming_beam/.gcloudignore index bd849df68f6..594de3d29c8 100644 --- a/dataflow/flex-templates/streaming_beam/.gcloudignore +++ b/dataflow/flex-templates/streaming_beam/.gcloudignore @@ -1,4 +1,5 @@ # Ignore everything except for Python files and the requirements file. * +!Dockerfile !*.py !requirements.txt \ No newline at end of file From e5bdc4f92a754df4661e527a6db5697cfc91a3f9 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 10 May 2021 11:31:02 -0700 Subject: [PATCH 33/57] fix bigquery dataset names --- dataflow/conftest.py | 8 ++++++-- dataflow/flex-templates/streaming_beam/e2e_test.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index f06b168237c..4fb5b3b04ae 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -57,13 +57,15 @@ def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str: bigquery_client = bigquery.Client() dataset = bigquery_client.create_dataset( - bigquery.Dataset(f"{project}.{dataset_name}_{UUID}") + bigquery.Dataset(f"{project}.{dataset_name.replace('-', '_')}_{UUID}") ) print(f"bigquery_dataset: {dataset.full_dataset_id}") yield dataset.full_dataset_id - bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True) + bigquery_client.delete_dataset( + dataset.full_dataset_id.replace(":", "."), delete_contents=True + ) @staticmethod def bigquery_query(query: str) -> Iterable[Dict[str, Any]]: @@ -295,6 +297,7 @@ def dataflow_flex_template_run( region: str = REGION, ) -> str: unique_job_name = f"{job_name}-{UUID}" + print(f"dataflow_job_name: {unique_job_name}") cmd = [ "gcloud", "dataflow", @@ -317,5 +320,6 @@ def dataflow_flex_template_run( @pytest.fixture(scope="session") def utils() -> Utils: + print(f"Test unique identifier: {UUID}") subprocess.run(["gcloud", "version"]) return Utils() diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 0fa6d853dfb..83d1bbc4e7e 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -43,7 +43,7 @@ def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str: @pytest.fixture(scope="session") def bigquery_dataset(utils: Utils) -> str: - yield from utils.bigquery_dataset(NAME.replace("-", "_")) + yield from utils.bigquery_dataset(NAME) @pytest.fixture(scope="session") From e9da280a1e9d5af89c9d0b3fa55041460b93b2aa Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 10 May 2021 15:14:55 -0700 Subject: [PATCH 34/57] add debugging information on subprocess --- dataflow/conftest.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 4fb5b3b04ae..e155d976731 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -310,12 +310,24 @@ def dataflow_flex_template_run( f"--temp_location=gs://{bucket_name}/temp", ] + [f"--parameters={name}={value}" for name, value in parameters.items()] print(cmd) - stdout = subprocess.run(cmd, check=True, capture_output=True).stdout.decode( - "utf-8" - ) - print(stdout) - print(f"Launched Dataflow Flex Template job: {unique_job_name}") - return json.loads(stdout)["job_id"] + try: + p = subprocess.run(cmd, check=True, capture_output=True) + stdout = p.stdout.decode("utf-8") + stderr = p.stdout.decode("utf-8") + print("--- stderr ---") + print(stderr.decode("utf-8")) + print("--- stdout ---") + print(stdout.decode("utf-8")) + print("--- end ---") + print(f"Launched Dataflow Flex Template job: {unique_job_name}") + return json.loads(stdout)["job_id"] + except subprocess.CalledProcessError as e: + print(e) + print("--- stderr ---") + print(e.stderr.decode("utf-8")) + print("--- stdout ---") + print(e.stdout.decode("utf-8")) + print("--- end ---") @pytest.fixture(scope="session") From f2260a323e709fcf5f0a43b41ec2ef951fbd2f9c Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 10 May 2021 15:37:05 -0700 Subject: [PATCH 35/57] fix gcloud command --- dataflow/conftest.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index e155d976731..ce66f61e744 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -268,6 +268,7 @@ def dataflow_flex_template_build( project: str = PROJECT, template_file: str = "template.json", ) -> str: + # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/build template_gcs_path = f"gs://{bucket_name}/{template_file}" cmd = [ "gcloud", @@ -296,6 +297,7 @@ def dataflow_flex_template_run( project: str = PROJECT, region: str = REGION, ) -> str: + # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/run unique_job_name = f"{job_name}-{UUID}" print(f"dataflow_job_name: {unique_job_name}") cmd = [ @@ -307,8 +309,13 @@ def dataflow_flex_template_run( f"--template-file-gcs-location={template_path}", f"--project={project}", f"--region={region}", - f"--temp_location=gs://{bucket_name}/temp", - ] + [f"--parameters={name}={value}" for name, value in parameters.items()] + ] + [ + f"--parameters={name}={value}" + for name, value in { + **parameters, + "temp_location": f"gs://{bucket_name}/temp", + }.items() + ] print(cmd) try: p = subprocess.run(cmd, check=True, capture_output=True) From 0765c3ca1d9d269295ccefccc7fd29d55501ac87 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 11 May 2021 10:59:22 -0700 Subject: [PATCH 36/57] remove redundant decode --- dataflow/conftest.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index ce66f61e744..2c7295995cc 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -319,15 +319,13 @@ def dataflow_flex_template_run( print(cmd) try: p = subprocess.run(cmd, check=True, capture_output=True) - stdout = p.stdout.decode("utf-8") - stderr = p.stdout.decode("utf-8") print("--- stderr ---") - print(stderr.decode("utf-8")) + print(p.stderr.decode("utf-8")) print("--- stdout ---") - print(stdout.decode("utf-8")) + print(p.stdout.decode("utf-8")) print("--- end ---") print(f"Launched Dataflow Flex Template job: {unique_job_name}") - return json.loads(stdout)["job_id"] + return json.loads(p.stdout.decode("utf-8"))["job_id"] except subprocess.CalledProcessError as e: print(e) print("--- stderr ---") From 9b16ba64799d95f5ff4b79f8ccab87c3b6a006cd Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 11 May 2021 11:21:58 -0700 Subject: [PATCH 37/57] fix getting flex template job id --- dataflow/conftest.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 2c7295995cc..789786864cc 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -21,6 +21,7 @@ import uuid import pytest +import yaml # Default options. UUID = uuid.uuid4().hex[0:6] @@ -245,11 +246,8 @@ def dataflow_jobs_cancel_by_job_id(job_id: str, project: str = PROJECT) -> None: @staticmethod def dataflow_jobs_cancel_by_job_name(job_name: str, project: str = PROJECT) -> None: import backoff - from googleapiclient.discovery import build from googleapiclient.errors import HttpError - dataflow = build("dataflow", "v1b3") - @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) def cancel(): # To cancel a dataflow job, we need its ID, not its name. @@ -325,7 +323,7 @@ def dataflow_flex_template_run( print(p.stdout.decode("utf-8")) print("--- end ---") print(f"Launched Dataflow Flex Template job: {unique_job_name}") - return json.loads(p.stdout.decode("utf-8"))["job_id"] + return yaml.safe_load(p.stdout.decode("utf-8"))["job"]["id"] except subprocess.CalledProcessError as e: print(e) print("--- stderr ---") From 4de0d99bb59a538fbe17f42e3892c9b1304ed19e Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 11 May 2021 11:37:16 -0700 Subject: [PATCH 38/57] add pyyaml test depdendency --- dataflow/conftest.py | 3 ++- dataflow/flex-templates/streaming_beam/requirements-test.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 789786864cc..87d9643be38 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -21,7 +21,6 @@ import uuid import pytest -import yaml # Default options. UUID = uuid.uuid4().hex[0:6] @@ -295,6 +294,8 @@ def dataflow_flex_template_run( project: str = PROJECT, region: str = REGION, ) -> str: + import yaml + # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/run unique_job_name = f"{job_name}-{UUID}" print(f"dataflow_job_name: {unique_job_name}") diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt index 01511732bed..53cd9882b4e 100644 --- a/dataflow/flex-templates/streaming_beam/requirements-test.txt +++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt @@ -2,3 +2,4 @@ google-api-python-client==2.1.0 google-cloud-storage==1.38.0 pytest-xdist==2.2.1 pytest==6.2.4 +pyyaml==5.4.1 \ No newline at end of file From 0a1a4ccb04046be610805841773bb2315158c9a0 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 11 May 2021 11:50:33 -0700 Subject: [PATCH 39/57] use stdout/stderr instead of capture_output --- dataflow/conftest.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 87d9643be38..ea9e4d1ad4d 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -317,7 +317,12 @@ def dataflow_flex_template_run( ] print(cmd) try: - p = subprocess.run(cmd, check=True, capture_output=True) + # The `capture_output` option was added in Python 3.7, so we must + # pass the `stdout` and `stderr` options explicitly to support 3.6. + # https://docs.python.org/3/library/subprocess.html#subprocess.run + p = subprocess.run( + cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) print("--- stderr ---") print(p.stderr.decode("utf-8")) print("--- stdout ---") From b4392ee0724b4416a5152e1a996c76e9f262c833 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 11 May 2021 11:55:15 -0700 Subject: [PATCH 40/57] reorganized error handling --- dataflow/conftest.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index ea9e4d1ad4d..d2b3d4cb1b6 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -16,6 +16,7 @@ import multiprocessing as mp import os import subprocess +import sys import time from typing import Any, Callable, Dict, Iterable, Optional import uuid @@ -323,20 +324,20 @@ def dataflow_flex_template_run( p = subprocess.run( cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE ) - print("--- stderr ---") - print(p.stderr.decode("utf-8")) - print("--- stdout ---") - print(p.stdout.decode("utf-8")) - print("--- end ---") + stdout = p.stdout.decode("utf-8") + stderr = p.stderr.decode("utf-8") print(f"Launched Dataflow Flex Template job: {unique_job_name}") - return yaml.safe_load(p.stdout.decode("utf-8"))["job"]["id"] except subprocess.CalledProcessError as e: - print(e) + print(e, file=sys.stderr) + stdout = stdout.decode("utf-8") + stderr = stderr.decode("utf-8") + finally: print("--- stderr ---") - print(e.stderr.decode("utf-8")) + print(stderr) print("--- stdout ---") - print(e.stdout.decode("utf-8")) + print(stdout) print("--- end ---") + return yaml.safe_load(stdout)["job"]["id"] @pytest.fixture(scope="session") From 669fbf200aa745895877f993e4604c66310ab98d Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 11 May 2021 15:00:52 -0700 Subject: [PATCH 41/57] retry cancel --- dataflow/conftest.py | 47 +++++++++---------- .../streaming_beam/requirements-test.txt | 1 + 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index d2b3d4cb1b6..5262837ac25 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -225,38 +225,37 @@ def dataflow_job_id_from_job_name( @staticmethod def dataflow_jobs_cancel_by_job_id(job_id: str, project: str = PROJECT) -> None: + import backoff from googleapiclient.discovery import build + from googleapiclient.errors import HttpError dataflow = build("dataflow", "v1b3") - # For more info, see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update - print(f"Canceling Dataflow job ID: {job_id}") - request = ( - dataflow.projects() - .jobs() - .update( - projectId=project, - jobId=job_id, - body={"requestedState": "JOB_STATE_CANCELLED"}, + @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) + def cancel_job(): + # For more info, see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update + print(f"Canceling Dataflow job ID: {job_id}") + request = ( + dataflow.projects() + .jobs() + .update( + projectId=project, + jobId=job_id, + body={"requestedState": "JOB_STATE_CANCELLED"}, + ) ) - ) - request.execute() + request.execute() + + cancel_job() @staticmethod def dataflow_jobs_cancel_by_job_name(job_name: str, project: str = PROJECT) -> None: - import backoff - from googleapiclient.errors import HttpError - - @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) - def cancel(): - # To cancel a dataflow job, we need its ID, not its name. - # If it doesn't, job_id will be equal to None. - job_id = Utils.dataflow_job_id_from_job_name(project, job_name) - if job_id is not None: - Utils.dataflow_jobs_cancel_by_job_id(job_id) - - cancel() + # To cancel a dataflow job, we need its ID, not its name. + # If it doesn't, job_id will be equal to None. + job_id = Utils.dataflow_job_id_from_job_name(project, job_name) + if job_id is not None: + Utils.dataflow_jobs_cancel_by_job_id(job_id) @staticmethod def dataflow_flex_template_build( diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt index 53cd9882b4e..bb00b0295a1 100644 --- a/dataflow/flex-templates/streaming_beam/requirements-test.txt +++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt @@ -1,3 +1,4 @@ +backoff==1.10.0 google-api-python-client==2.1.0 google-cloud-storage==1.38.0 pytest-xdist==2.2.1 From dab62185a933d51cc532105edd95ea4a5a5c0a16 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 12 May 2021 13:39:17 -0700 Subject: [PATCH 42/57] cancel dataflow job with region --- dataflow/conftest.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 5262837ac25..2802f449908 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -224,7 +224,9 @@ def dataflow_job_id_from_job_name( return None @staticmethod - def dataflow_jobs_cancel_by_job_id(job_id: str, project: str = PROJECT) -> None: + def dataflow_jobs_cancel_by_job_id( + job_id: str, project: str = PROJECT, region: str = REGION + ) -> None: import backoff from googleapiclient.discovery import build from googleapiclient.errors import HttpError @@ -242,6 +244,7 @@ def cancel_job(): .update( projectId=project, jobId=job_id, + location=region, body={"requestedState": "JOB_STATE_CANCELLED"}, ) ) @@ -250,12 +253,14 @@ def cancel_job(): cancel_job() @staticmethod - def dataflow_jobs_cancel_by_job_name(job_name: str, project: str = PROJECT) -> None: + def dataflow_jobs_cancel_by_job_name( + job_name: str, project: str = PROJECT, region: str = REGION + ) -> None: # To cancel a dataflow job, we need its ID, not its name. # If it doesn't, job_id will be equal to None. job_id = Utils.dataflow_job_id_from_job_name(project, job_name) if job_id is not None: - Utils.dataflow_jobs_cancel_by_job_id(job_id) + Utils.dataflow_jobs_cancel_by_job_id(job_id, project, region) @staticmethod def dataflow_flex_template_build( From c5929c70e3d2d94626f9f6456a9f0b80e55fc39e Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 12 May 2021 14:49:57 -0700 Subject: [PATCH 43/57] change cancel to gcloud --- dataflow/conftest.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 2802f449908..4d49016d9fa 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -238,17 +238,27 @@ def cancel_job(): # For more info, see: # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update print(f"Canceling Dataflow job ID: {job_id}") - request = ( - dataflow.projects() - .jobs() - .update( - projectId=project, - jobId=job_id, - location=region, - body={"requestedState": "JOB_STATE_CANCELLED"}, - ) - ) - request.execute() + # request = ( + # dataflow.projects() + # .jobs() + # .update( + # projectId=project, + # jobId=job_id, + # location=region, + # body={"requestedState": "JOB_STATE_CANCELLED"}, + # ) + # ) + # request.execute() + cmd = [ + "gcloud", + f"--project={project}", + "dataflow", + "jobs", + "cancel", + job_id, + f"--region={region}", + ] + subprocess.run(cmd, check=True) cancel_job() From ad1861cff348cd53074a9cca85be90aa3cccca9e Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 12 May 2021 15:20:30 -0700 Subject: [PATCH 44/57] simplify testing functions --- dataflow/conftest.py | 49 ++++++------------- .../streaming_beam/requirements-test.txt | 2 - 2 files changed, 15 insertions(+), 36 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 4d49016d9fa..ccdbc4ad7e5 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -227,40 +227,21 @@ def dataflow_job_id_from_job_name( def dataflow_jobs_cancel_by_job_id( job_id: str, project: str = PROJECT, region: str = REGION ) -> None: - import backoff - from googleapiclient.discovery import build - from googleapiclient.errors import HttpError - - dataflow = build("dataflow", "v1b3") - - @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME) - def cancel_job(): - # For more info, see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update - print(f"Canceling Dataflow job ID: {job_id}") - # request = ( - # dataflow.projects() - # .jobs() - # .update( - # projectId=project, - # jobId=job_id, - # location=region, - # body={"requestedState": "JOB_STATE_CANCELLED"}, - # ) - # ) - # request.execute() - cmd = [ - "gcloud", - f"--project={project}", - "dataflow", - "jobs", - "cancel", - job_id, - f"--region={region}", - ] - subprocess.run(cmd, check=True) - - cancel_job() + print(f"Canceling Dataflow job ID: {job_id}") + # We get an error using the googleapiclient.discovery APIs, probably + # due to incompatible dependencies with apache-beam. + # We use gcloud instead to cancel the job. + # https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/cancel + cmd = [ + "gcloud", + f"--project={project}", + "dataflow", + "jobs", + "cancel", + job_id, + f"--region={region}", + ] + subprocess.run(cmd, check=True) @staticmethod def dataflow_jobs_cancel_by_job_name( diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt index bb00b0295a1..3a98275ace3 100644 --- a/dataflow/flex-templates/streaming_beam/requirements-test.txt +++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt @@ -1,5 +1,3 @@ -backoff==1.10.0 -google-api-python-client==2.1.0 google-cloud-storage==1.38.0 pytest-xdist==2.2.1 pytest==6.2.4 From 6a81184df743be00a18999a1a5946cfc44d32a7b Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 21 May 2021 11:16:02 -0700 Subject: [PATCH 45/57] Update dataflow/__init__.py Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> --- dataflow/__init__.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/dataflow/__init__.py b/dataflow/__init__.py index ffc78f34e19..8b137891791 100644 --- a/dataflow/__init__.py +++ b/dataflow/__init__.py @@ -1,11 +1 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + From 212a5ca99822c4bf97ed00a84a4bbd3b72b3acb6 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 21 May 2021 11:16:10 -0700 Subject: [PATCH 46/57] Update dataflow/flex-templates/streaming_beam/__init__.py Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> --- dataflow/flex-templates/streaming_beam/__init__.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/dataflow/flex-templates/streaming_beam/__init__.py b/dataflow/flex-templates/streaming_beam/__init__.py index ffc78f34e19..8b137891791 100644 --- a/dataflow/flex-templates/streaming_beam/__init__.py +++ b/dataflow/flex-templates/streaming_beam/__init__.py @@ -1,11 +1 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + From 9085b1d8c24c182dc0237b09443845904bb1abb9 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 21 May 2021 11:16:21 -0700 Subject: [PATCH 47/57] Update dataflow/flex-templates/__init__.py Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> --- dataflow/flex-templates/__init__.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/dataflow/flex-templates/__init__.py b/dataflow/flex-templates/__init__.py index ffc78f34e19..8b137891791 100644 --- a/dataflow/flex-templates/__init__.py +++ b/dataflow/flex-templates/__init__.py @@ -1,11 +1 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + From 0fdc12714e8d5a2326f4746523ffda5e23f91a49 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 21 May 2021 11:17:38 -0700 Subject: [PATCH 48/57] Update dataflow/flex-templates/streaming_beam/noxfile_config.py Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> --- dataflow/flex-templates/streaming_beam/noxfile_config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dataflow/flex-templates/streaming_beam/noxfile_config.py b/dataflow/flex-templates/streaming_beam/noxfile_config.py index 79bccdd3e16..9d0a10cec91 100644 --- a/dataflow/flex-templates/streaming_beam/noxfile_config.py +++ b/dataflow/flex-templates/streaming_beam/noxfile_config.py @@ -32,6 +32,10 @@ # to use your own Cloud project. "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + # If you need to use a specific version of pip, + # change pip_version_override to the string representation + # of the version number, for example, "20.2.4" + "pip_version_override": None, # A dictionary you want to inject into your test. Don't put any # secrets here. These values will override predefined values. "envs": { From 29f7d628c2072972c5bc764afe43c1020d14d162 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 21 May 2021 11:25:11 -0700 Subject: [PATCH 49/57] Update __init__.py From 20deb574faab28ab76294bc2d53fbc791011bf95 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 21 May 2021 11:54:39 -0700 Subject: [PATCH 50/57] Make __init__.py empty From 74b8779e1c8607af10ac03fff7cedbf8309cb099 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 21 May 2021 11:57:23 -0700 Subject: [PATCH 51/57] make __init__ files actually empty --- dataflow/__init__.py | 1 - dataflow/flex-templates/streaming_beam/__init__.py | 1 - 2 files changed, 2 deletions(-) diff --git a/dataflow/__init__.py b/dataflow/__init__.py index 8b137891791..e69de29bb2d 100644 --- a/dataflow/__init__.py +++ b/dataflow/__init__.py @@ -1 +0,0 @@ - diff --git a/dataflow/flex-templates/streaming_beam/__init__.py b/dataflow/flex-templates/streaming_beam/__init__.py index 8b137891791..e69de29bb2d 100644 --- a/dataflow/flex-templates/streaming_beam/__init__.py +++ b/dataflow/flex-templates/streaming_beam/__init__.py @@ -1 +0,0 @@ - From e2cd129ce729f842a9e330afcd77fdd746e9d4be Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 1 Jun 2021 16:53:53 -0700 Subject: [PATCH 52/57] wait for job before cancel --- dataflow/conftest.py | 25 +++++++++++++++++++ .../flex-templates/streaming_beam/e2e_test.py | 9 ++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index ccdbc4ad7e5..e402554023f 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -223,6 +223,31 @@ def dataflow_job_id_from_job_name( return job["id"] return None + @staticmethod + def dataflow_jobs_wait( + job_id: str, + project: str = PROJECT, + status: str = "JOB_STATE_RUNNING", + ) -> bool: + from googleapiclient.discovery import build + + dataflow = build("dataflow", "v1b3") + + sleep_time_seconds = 30 + max_sleep_time = 10 * 60 + print(f"Waiting for Dataflow job ID: {job_id} (until status {status})") + for _ in range(0, max_sleep_time, sleep_time_seconds): + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get + jobs_request = ( + dataflow.projects().jobs().get(projectId=project, jobId=job_id) + ) + response = jobs_request.execute() + if response["currentState"] == status: + return True + time.sleep(30) + return False + @staticmethod def dataflow_jobs_cancel_by_job_id( job_id: str, project: str = PROJECT, region: str = REGION diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 83d1bbc4e7e..e642306ed4b 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -94,11 +94,14 @@ def test_flex_template_run( ) # Since this is a streaming job, it will never finish running. - # Wait for 10 minutes, and then cancel the job. - time.sleep(10 * 60) + # First, lets wait until the job is running. + utils.dataflow_jobs_wait(job_id) + + # Then, wait a minute for data to arrive, get processed, and cancel it. + time.sleep(60) utils.dataflow_jobs_cancel_by_job_id(job_id) - # Check for output data in BigQuery. + # Check for the output data in BigQuery. query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}" rows = list(utils.bigquery_query(query)) assert len(rows) > 0 From 0bd17076b0d9b487854ee768d289d26d5db6d162 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 1 Jun 2021 17:56:05 -0700 Subject: [PATCH 53/57] add api client libraries --- dataflow/flex-templates/streaming_beam/requirements-test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt index 3a98275ace3..53cd9882b4e 100644 --- a/dataflow/flex-templates/streaming_beam/requirements-test.txt +++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt @@ -1,3 +1,4 @@ +google-api-python-client==2.1.0 google-cloud-storage==1.38.0 pytest-xdist==2.2.1 pytest==6.2.4 From 373ffb18717ac7074baf32d3eb7335b5ca230c73 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 1 Jun 2021 19:18:31 -0700 Subject: [PATCH 54/57] sleep before waiting for job --- dataflow/conftest.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index e402554023f..daad3531caf 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -235,6 +235,12 @@ def dataflow_jobs_wait( sleep_time_seconds = 30 max_sleep_time = 10 * 60 + + # It takes a couple seconds for the job_id to be findable by the API client. + # Sleep for a small duration initially to wait until we can access the job + # from the client library. + time.sleep(sleep_time_seconds) + print(f"Waiting for Dataflow job ID: {job_id} (until status {status})") for _ in range(0, max_sleep_time, sleep_time_seconds): # For more info see: @@ -245,7 +251,7 @@ def dataflow_jobs_wait( response = jobs_request.execute() if response["currentState"] == status: return True - time.sleep(30) + time.sleep(sleep_time_seconds) return False @staticmethod From ca118e11d4dd0deec2dc53ea2781b815db4ef194 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 1 Jun 2021 19:38:25 -0700 Subject: [PATCH 55/57] add more logging --- dataflow/conftest.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index daad3531caf..6beebdeea42 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -246,9 +246,12 @@ def dataflow_jobs_wait( # For more info see: # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get jobs_request = ( - dataflow.projects().jobs().get(projectId=project, jobId=job_id) + dataflow.projects() + .jobs() + .get(projectId=project, jobId=job_id, jobView="JOB_VIEW_SUMMARY") ) response = jobs_request.execute() + print(response) if response["currentState"] == status: return True time.sleep(sleep_time_seconds) From a07b2f6f4849c54932b1a8db9a5e0d3b18c1291a Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 2 Jun 2021 10:08:26 -0700 Subject: [PATCH 56/57] fix parameter name --- dataflow/conftest.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 6beebdeea42..8473402c53d 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -248,7 +248,11 @@ def dataflow_jobs_wait( jobs_request = ( dataflow.projects() .jobs() - .get(projectId=project, jobId=job_id, jobView="JOB_VIEW_SUMMARY") + .get( + projectId=project, + jobId=job_id, + view="JOB_VIEW_SUMMARY", + ) ) response = jobs_request.execute() print(response) From 6c9bcc329f113dcf0084cc5a4554b030f1afa06d Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 2 Jun 2021 11:23:48 -0700 Subject: [PATCH 57/57] surround wait in try-except --- dataflow/conftest.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 8473402c53d..13314bf86dd 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -236,28 +236,26 @@ def dataflow_jobs_wait( sleep_time_seconds = 30 max_sleep_time = 10 * 60 - # It takes a couple seconds for the job_id to be findable by the API client. - # Sleep for a small duration initially to wait until we can access the job - # from the client library. - time.sleep(sleep_time_seconds) - print(f"Waiting for Dataflow job ID: {job_id} (until status {status})") for _ in range(0, max_sleep_time, sleep_time_seconds): - # For more info see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get - jobs_request = ( - dataflow.projects() - .jobs() - .get( - projectId=project, - jobId=job_id, - view="JOB_VIEW_SUMMARY", + try: + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get + jobs_request = ( + dataflow.projects() + .jobs() + .get( + projectId=project, + jobId=job_id, + view="JOB_VIEW_SUMMARY", + ) ) - ) - response = jobs_request.execute() - print(response) - if response["currentState"] == status: - return True + response = jobs_request.execute() + print(response) + if response["currentState"] == status: + return True + except: + pass time.sleep(sleep_time_seconds) return False