From 90c273008d8a4a3f8d884bc9c38a346be3ffa43a Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 13 Apr 2021 15:32:58 -0700
Subject: [PATCH 01/57] dataflow: update Dockerfile and modularize tests

---
 .../flex-templates/streaming_beam/Dockerfile  |  27 +--
 .../flex-templates/streaming_beam/e2e_test.py | 112 ++++++++++
 .../flex-templates/streaming_beam/noxfile.py  | 171 --------------
 .../streaming_beam/noxfile_config.py          |  40 ++++
 .../streaming_beam/requirements-test.txt      |   1 -
 .../streaming_beam/streaming_beam.py          | 121 +++++-----
 .../streaming_beam/streaming_beam_test.py     | 165 --------------
 dataflow/requirements-test.txt                |   4 +
 dataflow/testing_utils.py                     | 210 ++++++++++++++++++
 9 files changed, 449 insertions(+), 402 deletions(-)
 create mode 100644 dataflow/flex-templates/streaming_beam/e2e_test.py
 delete mode 100644 dataflow/flex-templates/streaming_beam/noxfile.py
 create mode 100644 dataflow/flex-templates/streaming_beam/noxfile_config.py
 delete mode 100644 dataflow/flex-templates/streaming_beam/requirements-test.txt
 delete mode 100644 dataflow/flex-templates/streaming_beam/streaming_beam_test.py
 create mode 100644 dataflow/requirements-test.txt
 create mode 100644 dataflow/testing_utils.py

diff --git a/dataflow/flex-templates/streaming_beam/Dockerfile b/dataflow/flex-templates/streaming_beam/Dockerfile
index 554720eee96..02f346957af 100644
--- a/dataflow/flex-templates/streaming_beam/Dockerfile
+++ b/dataflow/flex-templates/streaming_beam/Dockerfile
@@ -14,19 +14,20 @@
 
 FROM gcr.io/dataflow-templates-base/python3-template-launcher-base
 
-ARG WORKDIR=/dataflow/template
-RUN mkdir -p ${WORKDIR}
-WORKDIR ${WORKDIR}
+ENV FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE="/template/requirements.txt"
+ENV FLEX_TEMPLATE_PYTHON_PY_FILE="/template/streaming_beam.py"
 
-# Due to a change in the Beam base image in version 2.24, we need to install
-# libffi-dev manually as a dependency. For more information:
-#   https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4891
-RUN apt-get update && apt-get install -y libffi-dev git && rm -rf /var/lib/apt/lists/*
+COPY . /template
 
-COPY requirements.txt .
-COPY streaming_beam.py .
+# libffi-dev was needed since Beam 2.24 (issue #4891); TODO: check if libffi-dev and git are still required.
+RUN apt-get update \
+    && apt-get install -y libffi-dev git \
+    && rm -rf /var/lib/apt/lists/* \
+    # Upgrade pip and install the requirements.
+    && pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r $FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE \
+    # Download the requirements to speed up launching the Dataflow job.
+    && pip download --no-cache-dir --dest /tmp/dataflow-requirements-cache -r $FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE
 
-ENV FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE="${WORKDIR}/requirements.txt"
-ENV FLEX_TEMPLATE_PYTHON_PY_FILE="${WORKDIR}/streaming_beam.py"
-
-RUN pip install -U -r ./requirements.txt
+# Since we already downloaded all the dependencies, there's no need to rebuild everything.
+ENV PIP_NO_DEPS=True
diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
new file mode 100644
index 00000000000..ded698e5e30
--- /dev/null
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -0,0 +1,112 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import os
+import subprocess
+import time
+import uuid
+
+from google.cloud import bigquery
+import pytest
+
+from . import testing_utils
+
+
+SUFFIX = uuid.uuid4().hex[0:6]
+PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"]
+BUCKET_NAME = f"flex-templates-streaming-beam-{SUFFIX}"
+BIGQUERY_DATASET = f"flex_templates_{SUFFIX}"
+BIGQUERY_TABLE = "streaming_beam"
+TOPIC = f"flex-templates-streaming-beam-{SUFFIX}"
+SUBSCRIPTION = TOPIC
+IMAGE_NAME = f"gcr.io/{PROJECT}/dataflow/flex-templates/streaming-beam-{SUFFIX}:latest"
+TEMPLATE_FILE = "template.json"
+REGION = "us-central1"
+
+
+@pytest.fixture(scope="session")
+def bucket_name() -> str:
+    yield from testing_utils.storage_bucket(BUCKET_NAME)  # delegate setup and teardown
+
+
+@pytest.fixture(scope="session")
+def topic_path() -> str:
+    yield from testing_utils.pubsub_topic(PROJECT, TOPIC)  # delegate setup and teardown
+
+
+@pytest.fixture(scope="session")
+def subscription_path(topic_path: str) -> str:
+    yield from testing_utils.pubsub_subscription(PROJECT, topic_path, SUBSCRIPTION)
+
+
+@pytest.fixture(scope="session")
+def bigquery_dataset() -> str:
+    yield from testing_utils.bigquery_dataset(PROJECT, BIGQUERY_DATASET)
+
+
+@pytest.fixture(scope="session")
+def publisher(topic_path: str) -> bool:
+    yield from testing_utils.pubsub_publisher(topic_path, lambda _: '{"url": "https://beam.apache.org/", "review": "positive"}')
+
+
+@pytest.fixture(scope="session")
+def template_image() -> str:
+    yield from testing_utils.container_image(PROJECT, IMAGE_NAME)  # build, then delete
+
+
+@pytest.fixture(scope="session")
+def template_path(bucket_name: str, template_image: str) -> str:
+    yield from testing_utils.dataflow_flex_template_build(
+        bucket_name=bucket_name,
+        template_file=TEMPLATE_FILE,
+        template_image=template_image,
+        metadata_file="metadata.json",
+    )
+
+
+def test_run_template(
+    publisher: bool,
+    bucket_name: str,
+    template_path: str,
+    bigquery_dataset: str,
+    subscription_path: str,
+) -> None:
+    # `publisher` is requested only so messages flow into the topic during the run.
+    job_name = f"flex-templates-streaming-beam-{SUFFIX}"
+    subprocess.run(
+        [
+            "gcloud",
+            "dataflow",
+            "flex-template",
+            "run",
+            job_name,
+            f"--template-file-gcs-location={template_path}",
+            f"--temp_location=gs://{bucket_name}/temp",
+            f"--parameters=input_subscription={subscription_path}",
+            f"--parameters=output_table={bigquery_dataset}.{BIGQUERY_TABLE}",
+            f"--region={REGION}",
+        ],
+        check=True,
+    )
+
+    # Wait for 10 minutes, and then cancel the job.
+    time.sleep(10 * 60)
+    testing_utils.dataflow_jobs_cancel(PROJECT, job_name)
+
+    # Check for output data in BigQuery.
+    bigquery_client = bigquery.Client()
+    query = f"SELECT * FROM {PROJECT}.{BIGQUERY_DATASET}.{BIGQUERY_TABLE}"
+    query_job = bigquery_client.query(query)
+    rows = query_job.result()
+    assert rows.total_rows > 0
+    for row in rows:
+        assert row["score"] == 1
diff --git a/dataflow/flex-templates/streaming_beam/noxfile.py b/dataflow/flex-templates/streaming_beam/noxfile.py
deleted file mode 100644
index c917ea77ced..00000000000
--- a/dataflow/flex-templates/streaming_beam/noxfile.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# Copyright 2019 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-from pathlib import Path
-
-import nox
-
-
-# DO NOT EDIT - automatically generated.
-# All versions used to tested samples.
-ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8", "3.9"]
-
-# Any default versions that should be ignored.
-IGNORED_VERSIONS = ["2.7", "3.8", "3.9"]
-
-TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS])
-
-#
-# Style Checks
-#
-
-
-def _determine_local_import_names(start_dir):
-    """Determines all import names that should be considered "local".
-
-    This is used when running the linter to insure that import order is
-    properly checked.
-    """
-    file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)]
-    return [
-        basename
-        for basename, extension in file_ext_pairs
-        if extension == ".py"
-        or os.path.isdir(os.path.join(start_dir, basename))
-        and basename not in ("__pycache__")
-    ]
-
-
-# Linting with flake8.
-#
-# We ignore the following rules:
-#   E203: whitespace before ‘:’
-#   E266: too many leading ‘#’ for block comment
-#   E501: line too long
-#   I202: Additional newline in a section of imports
-#
-# We also need to specify the rules which are ignored by default:
-# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121']
-FLAKE8_COMMON_ARGS = [
-    "--show-source",
-    "--builtin=gettext",
-    "--max-complexity=20",
-    "--import-order-style=google",
-    "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py",
-    "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202",
-    "--max-line-length=88",
-]
-
-
-@nox.session
-def lint(session):
-    session.install("flake8", "flake8-import-order")
-
-    local_names = _determine_local_import_names(".")
-    args = FLAKE8_COMMON_ARGS + [
-        "--application-import-names",
-        ",".join(local_names),
-        ".",
-    ]
-    session.run("flake8", *args)
-
-
-#
-# Black
-#
-
-@nox.session
-def blacken(session):
-    session.install("black")
-    python_files = [path for path in os.listdir(".") if path.endswith(".py")]
-
-    session.run("black", *python_files)
-
-
-#
-# Sample Tests
-#
-
-
-PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"]
-
-
-def _session_tests(session, post_install=None):
-    """Runs py.test for a particular project."""
-    if os.path.exists("requirements.txt"):
-        session.install("-r", "requirements.txt")
-
-    if os.path.exists("requirements-test.txt"):
-        session.install("-r", "requirements-test.txt")
-
-    if post_install:
-        post_install(session)
-
-    session.run(
-        "pytest",
-        *(PYTEST_COMMON_ARGS + session.posargs),
-        # Pytest will return 5 when no tests are collected. This can happen
-        # on travis where slow and flaky tests are excluded.
-        # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html
-        success_codes=[0, 5]
-    )
-
-
-@nox.session(python=ALL_VERSIONS)
-def py(session):
-    """Runs py.test for a sample using the specified version of Python."""
-    if session.python in TESTED_VERSIONS:
-        _session_tests(session)
-    else:
-        print("SKIPPED: {} tests are disabled for this sample.".format(session.python))
-
-
-#
-# Readmegen
-#
-
-
-def _get_repo_root():
-    """ Returns the root folder of the project. """
-    # Get root of this repository. Assume we don't have directories nested deeper than 10 items.
-    p = Path(os.getcwd())
-    for i in range(10):
-        if p is None:
-            break
-        if Path(p / ".git").exists():
-            return str(p)
-        p = p.parent
-    raise Exception("Unable to detect repository root.")
-
-
-GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")])
-
-
-@nox.session
-@nox.parametrize("path", GENERATED_READMES)
-def readmegen(session, path):
-    """(Re-)generates the readme for a sample."""
-    session.install("jinja2", "pyyaml")
-    dir_ = os.path.dirname(path)
-
-    if os.path.exists(os.path.join(dir_, "requirements.txt")):
-        session.install("-r", os.path.join(dir_, "requirements.txt"))
-
-    in_file = os.path.join(dir_, "README.rst.in")
-    session.run(
-        "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file
-    )
diff --git a/dataflow/flex-templates/streaming_beam/noxfile_config.py b/dataflow/flex-templates/streaming_beam/noxfile_config.py
new file mode 100644
index 00000000000..b6ba946dcbc
--- /dev/null
+++ b/dataflow/flex-templates/streaming_beam/noxfile_config.py
@@ -0,0 +1,40 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Default TEST_CONFIG_OVERRIDE for python repos.
+
+# You can copy this file into your directory, then it will be imported from
+# the noxfile.py.
+
+# The source of truth:
+# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/noxfile_config.py
+
+TEST_CONFIG_OVERRIDE = {
+    # You can opt out from the test for specific Python versions.
+    "ignored_versions": ["2.7"],
+    # Old samples are opted out of enforcing Python type hints
+    # All new samples should feature them
+    "enforce_type_hints": True,
+    # An envvar key for determining the project id to use. Change it
+    # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a
+    # build specific Cloud project. You can also use your own string
+    # to use your own Cloud project.
+    "gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
+    # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',
+    # A dictionary you want to inject into your test. Don't put any
+    # secrets here. These values will override predefined values.
+    "envs": {
+        "PYTEST_ADDOPTS": "-n=8",  # parallelize tests in multiple CPUs
+    },
+}
diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt
deleted file mode 100644
index d5bd56fd179..00000000000
--- a/dataflow/flex-templates/streaming_beam/requirements-test.txt
+++ /dev/null
@@ -1 +0,0 @@
-pytest==6.2.1
diff --git a/dataflow/flex-templates/streaming_beam/streaming_beam.py b/dataflow/flex-templates/streaming_beam/streaming_beam.py
index af1321e8e18..ab2ecfc9087 100644
--- a/dataflow/flex-templates/streaming_beam/streaming_beam.py
+++ b/dataflow/flex-templates/streaming_beam/streaming_beam.py
@@ -24,82 +24,99 @@
 import json
 import logging
 import time
+from typing import Any, Dict, List
 
 import apache_beam as beam
 from apache_beam.options.pipeline_options import PipelineOptions
 import apache_beam.transforms.window as window
 
 # Defines the BigQuery schema for the output table.
-SCHEMA = ','.join([
-    'url:STRING',
-    'num_reviews:INTEGER',
-    'score:FLOAT64',
-    'first_date:TIMESTAMP',
-    'last_date:TIMESTAMP',
-])
-
-
-def parse_json_message(message):
+SCHEMA = ",".join(
+    [
+        "url:STRING",
+        "num_reviews:INTEGER",
+        "score:FLOAT64",
+        "first_date:TIMESTAMP",
+        "last_date:TIMESTAMP",
+    ]
+)
+
+
+def parse_json_message(message: str) -> Dict[str, Any]:
     """Parse the input json message and add 'score' & 'processing_time' keys."""
     row = json.loads(message)
     return {
-        'url': row['url'],
-        'score': 1.0 if row['review'] == 'positive' else 0.0,
-        'processing_time': int(time.time()),
-    }
-
-
-def get_statistics(url_messages):
-    """Get statistics from the input URL messages."""
-    url, messages = url_messages
-    return {
-        'url': url,
-        'num_reviews': len(messages),
-        'score': sum(msg['score'] for msg in messages) / len(messages),
-        'first_date': min(msg['processing_time'] for msg in messages),
-        'last_date': max(msg['processing_time'] for msg in messages),
+        "url": row["url"],
+        "score": 1.0 if row["review"] == "positive" else 0.0,
+        "processing_time": int(time.time()),
     }
 
 
-def run(args, input_subscription, output_table, window_interval):
+def run(
+    input_subscription: str,
+    output_table: str,
+    window_interval_sec: int = 60,
+    beam_args: List[str] = None,
+) -> None:
     """Build and run the pipeline."""
-    options = PipelineOptions(args, save_main_session=True, streaming=True)
+    options = PipelineOptions(beam_args, save_main_session=True, streaming=True)
 
     with beam.Pipeline(options=options) as pipeline:
-
-        # Read the messages from PubSub and process them.
         messages = (
             pipeline
-            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
-                subscription=input_subscription).with_output_types(bytes)
-            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
-            | 'Parse JSON messages' >> beam.Map(parse_json_message)
-            | 'Fixed-size windows' >> beam.WindowInto(
-                window.FixedWindows(int(window_interval), 0))
-            | 'Add URL keys' >> beam.Map(lambda msg: (msg['url'], msg))
-            | 'Group by URLs' >> beam.GroupByKey()
-            | 'Get statistics' >> beam.Map(get_statistics))
+            | "Read from Pub/Sub"
+            >> beam.io.ReadFromPubSub(
+                subscription=input_subscription
+            ).with_output_types(bytes)
+            | "UTF-8 bytes to string" >> beam.Map(lambda msg: msg.decode("utf-8"))
+            | "Parse JSON messages" >> beam.Map(parse_json_message)
+            | "Fixed-size windows"
+            >> beam.WindowInto(window.FixedWindows(window_interval_sec, 0))
+            | "Add URL keys" >> beam.WithKeys(lambda msg: msg["url"])
+            | "Group by URLs" >> beam.GroupByKey()
+            | "Get statistics"
+            >> beam.MapTuple(
+                lambda url, messages: {
+                    "url": url,
+                    "num_reviews": len(messages),
+                    "score": sum(msg["score"] for msg in messages) / len(messages),
+                    "first_date": min(msg["processing_time"] for msg in messages),
+                    "last_date": max(msg["processing_time"] for msg in messages),
+                }
+            )
+        )
 
         # Output the results into BigQuery table.
-        _ = messages | 'Write to Big Query' >> beam.io.WriteToBigQuery(
-            output_table, schema=SCHEMA)
+        _ = messages | "Write to Big Query" >> beam.io.WriteToBigQuery(
+            output_table, schema=SCHEMA
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     logging.getLogger().setLevel(logging.INFO)
+
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        '--output_table',
-        help='Output BigQuery table for results specified as: '
-        'PROJECT:DATASET.TABLE or DATASET.TABLE.')
+        "--output_table",
+        help="Output BigQuery table for results specified as: "
+        "PROJECT:DATASET.TABLE or DATASET.TABLE.",
+    )
     parser.add_argument(
-        '--input_subscription',
-        help='Input PubSub subscription of the form '
-        '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."')
+        "--input_subscription",
+        help="Input PubSub subscription of the form "
+        '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."',
+    )
     parser.add_argument(
-        '--window_interval',
+        "--window_interval_sec",
         default=60,
-        help='Window interval in seconds for grouping incoming messages.')
-    known_args, pipeline_args = parser.parse_known_args()
-    run(pipeline_args, known_args.input_subscription, known_args.output_table,
-        known_args.window_interval)
+        type=int,
+        help="Window interval in seconds for grouping incoming messages.",
+    )
+    args, beam_args = parser.parse_known_args()
+
+    run(
+        input_subscription=args.input_subscription,
+        output_table=args.output_table,
+        window_interval_sec=args.window_interval_sec,
+        beam_args=beam_args,
+    )
diff --git a/dataflow/flex-templates/streaming_beam/streaming_beam_test.py b/dataflow/flex-templates/streaming_beam/streaming_beam_test.py
deleted file mode 100644
index d588adf432f..00000000000
--- a/dataflow/flex-templates/streaming_beam/streaming_beam_test.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-import multiprocessing as mp
-import os
-import subprocess as sp
-import tempfile
-import time
-import uuid
-
-from google.cloud import bigquery
-from google.cloud import pubsub
-import pytest
-
-
-PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"]
-UUID = str(uuid.uuid4()).split('-')[0]
-DATASET = 'beam_samples_{}'.format(UUID)
-TABLE = 'streaming_beam_sql'
-TOPIC = 'messages-{}'.format(UUID)
-SUBSCRIPTION = TOPIC
-
-
-@pytest.fixture
-def topic_path():
-    publisher_client = pubsub.PublisherClient()
-    topic_path = publisher_client.topic_path(PROJECT, TOPIC)
-    try:
-        publisher_client.delete_topic(topic_path)
-    except Exception:
-        pass
-    topic = publisher_client.create_topic(topic_path)
-    yield topic.name
-    # Due to the pinned library dependencies in apache-beam, client
-    # library throws an error upon deletion.
-    # We use gcloud for a workaround. See also:
-    # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-    sp.check_call(
-        ['gcloud', 'pubsub', '--project', PROJECT, 'topics', 'delete', TOPIC])
-
-
-@pytest.fixture
-def subscription_path(topic_path):
-    subscriber = pubsub.SubscriberClient()
-    subscription_path = subscriber.subscription_path(PROJECT, SUBSCRIPTION)
-    try:
-        subscriber.delete_subscription(subscription_path)
-    except Exception:
-        pass
-    subscription = subscriber.create_subscription(subscription_path, topic_path)
-    yield subscription.name
-
-    # Due to the pinned library dependencies in apache-beam, client
-    # library throws an error upon deletion.
-    # We use gcloud for a workaround. See also:
-    # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-    sp.check_call(
-        ['gcloud', 'pubsub', '--project', PROJECT, 'subscriptions', 'delete',
-         SUBSCRIPTION])
-
-
-@pytest.fixture
-def dataset():
-    bigquery_client = bigquery.Client(project=PROJECT)
-    dataset_id = '{}.{}'.format(PROJECT, DATASET)
-    dataset = bigquery.Dataset(dataset_id)
-    dataset = bigquery_client.create_dataset(dataset, exists_ok=True)
-    yield '{}:{}'.format(PROJECT, DATASET)
-    bigquery_client.delete_table('{}.{}'.format(DATASET, TABLE), not_found_ok=True)
-    bigquery_client.delete_dataset(DATASET, not_found_ok=True)
-
-
-def _infinite_publish_job(topic_path):
-    publisher_client = pubsub.PublisherClient()
-    while True:
-        future = publisher_client.publish(
-            topic_path,
-            b'{"url": "https://beam.apache.org/", "review": "positive"}')
-        future.result()
-        time.sleep(1)
-
-
-def test_dataflow_flex_templates_pubsub_to_bigquery(dataset, topic_path,
-                                                    subscription_path):
-    # Use one process to publish messages to a topic.
-    publish_process = mp.Process(target=lambda: _infinite_publish_job(topic_path))
-
-    # Use another process to run the streaming pipeline that should write one
-    # row to BigQuery every minute (according to the default window size).
-    pipeline_process = mp.Process(target=lambda: sp.call([
-        'python', 'streaming_beam.py',
-        '--project', PROJECT,
-        '--runner', 'DirectRunner',
-        '--temp_location', tempfile.mkdtemp(),
-        '--input_subscription', subscription_path,
-        '--output_table', '{}.{}'.format(dataset, TABLE),
-        '--window_interval', '5',
-    ]))
-
-    publish_process.start()
-    pipeline_process.start()
-
-    pipeline_process.join(timeout=30)
-    publish_process.join(timeout=0)
-
-    pipeline_process.terminate()
-    publish_process.terminate()
-
-    # Check for output data in BigQuery.
-    bigquery_client = bigquery.Client(project=PROJECT)
-    query = 'SELECT * FROM {}.{}'.format(DATASET, TABLE)
-    query_job = bigquery_client.query(query)
-    rows = query_job.result()
-    assert rows.total_rows > 0
-    for row in rows:
-        assert row['score'] == 1
-
-
-# TODO:Testcase using Teststream currently does not work as intended.
-# The first write to BigQuery fails. Have filed a bug. The test case
-# to be changed once the bug gets fixed.  b/152446921
-'''
-@mock.patch("apache_beam.Pipeline", TestPipeline)
-@mock.patch(
-    "apache_beam.io.ReadFromPubSub",
-    lambda subscription: (
-        TestStream()
-        .advance_watermark_to(0)
-        .advance_processing_time(60)
-        .add_elements([TimestampedValue(
-            b'{"url": "https://beam.apache.org/", "review": "positive"}',
-                1575937195)])
-        .advance_processing_time(60)
-        .add_elements([TimestampedValue(
-            b'{"url": "https://beam.apache.org/", "review": "positive"}',
-                1575937255)])
-        .advance_watermark_to_infinity()
-    ),
-)
-def test_dataflow_flex_templates_pubsub_to_bigquery(dataset):
-    streaming_beam.run(
-        args=[
-            "--project", PROJECT,
-            "--runner", "DirectRunner"
-        ],
-        input_subscription="unused",
-        output_table='{}:{}.{}'.format(PROJECT, DATASET, TABLE),
-    )
-
-    # Check for output data in BigQuery.
-    bigquery_client = bigquery.Client(project=PROJECT)
-    query = 'SELECT * FROM {}.{}'.format(DATASET, TABLE)
-    query_job = bigquery_client.query(query)
-    rows = query_job.result()
-    assert rows.total_rows > 0
-'''
diff --git a/dataflow/requirements-test.txt b/dataflow/requirements-test.txt
new file mode 100644
index 00000000000..16def8b7d52
--- /dev/null
+++ b/dataflow/requirements-test.txt
@@ -0,0 +1,4 @@
+backoff==1.10.0
+google-api-python-client==2.1.0
+pytest-xdist==2.2.1
+pytest==6.2.1
diff --git a/dataflow/testing_utils.py b/dataflow/testing_utils.py
new file mode 100644
index 00000000000..f0cedcec36e
--- /dev/null
+++ b/dataflow/testing_utils.py
@@ -0,0 +1,210 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from datetime import time
+import itertools
+import subprocess
+import multiprocessing as mp
+from typing import Callable, List, Optional
+
+import backoff
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+from google.cloud import bigquery
+from google.cloud import pubsub
+from google.cloud import storage
+
+dataflow = build("dataflow", "v1b3")
+
+
+RETRY_MAX_TIME = 5 * 60  # 5 minutes in seconds
+
+
+def storage_bucket(bucket_name: str) -> str:
+    storage_client = storage.Client()
+    bucket = storage_client.create_bucket(bucket_name)
+
+    print(f"storage_bucket: {repr(bucket_name)}")
+    yield bucket_name  # generator: the code below runs as cleanup when resumed
+
+    bucket.delete(force=True)
+
+
+def bigquery_dataset(project: str, dataset_name: str) -> str:
+    bigquery_client = bigquery.Client()
+    dataset = bigquery.Dataset(f"{project}.{dataset_name}")
+    dataset = bigquery_client.create_dataset(dataset)
+
+    print(f"bigquery_dataset: {dataset.full_dataset_id}")
+    yield dataset.full_dataset_id
+    # Pass the Dataset object: full_dataset_id is "project:dataset", not a "project.dataset" ID.
+    bigquery_client.delete_dataset(dataset, delete_contents=True)
+
+
+def pubsub_topic(project: str, topic_name: str) -> str:
+    publisher_client = pubsub.PublisherClient()
+    topic_path = publisher_client.topic_path(project, topic_name)
+    topic = publisher_client.create_topic(topic_path)
+
+    print(f"pubsub_topic: {repr(topic.name)}")
+    yield topic.name
+
+    # Due to the pinned library dependencies in apache-beam, client
+    # library throws an error upon deletion.
+    # We use gcloud for a workaround. See also:
+    # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
+    subprocess.run(
+        ["gcloud", "pubsub", "--project", project, "topics", "delete", topic_name],
+        check=True,
+    )
+
+
+def pubsub_subscription(project: str, topic_path: str, subscription_name: str) -> str:
+    subscriber = pubsub.SubscriberClient()
+    subscription_path = subscriber.subscription_path(project, subscription_name)
+    subscription = subscriber.create_subscription(subscription_path, topic_path)
+
+    print(f"pubsub_subscription: {repr(subscription.name)}")
+    yield subscription.name
+
+    # Due to the pinned library dependencies in apache-beam, client
+    # library throws an error upon deletion.
+    # We use gcloud for a workaround. See also:
+    # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
+    subprocess.run(
+        [
+            "gcloud",
+            "pubsub",
+            "--project",
+            project,
+            "subscriptions",
+            "delete",
+            subscription_name,
+        ],
+        check=True,
+    )
+
+
+def pubsub_publisher(topic_path: str, new_msg: Callable[[int], str]) -> bool:
+    def _infinite_publish_job() -> None:
+        import time  # the module-level `time` is datetime.time, which has no sleep()
+        publisher_client = pubsub.PublisherClient()
+        for i in itertools.count():
+            publisher_client.publish(topic_path, new_msg(i).encode("utf-8")).result()
+            time.sleep(1)
+
+    # Start a subprocess in the background to do the publishing.
+    p = mp.Process(target=_infinite_publish_job)
+    p.start()
+    yield p.is_alive()
+
+    # For cleanup, terminate the background process.
+    p.join(timeout=0)
+    p.terminate()
+
+
+def container_image(project: str, image_name: str) -> str:
+    subprocess.run(["gcloud", "auth", "configure-docker"], check=True)
+    subprocess.run(
+        [
+            "gcloud",
+            "builds",
+            "submit",
+            f"--project={project}",
+            f"--tag={image_name}",
+            ".",  # build source: the current working directory
+        ],
+        check=True,
+    )
+
+    yield image_name  # generator: the delete command below runs as cleanup when resumed
+
+    subprocess.run(
+        [
+            "gcloud",
+            "container",
+            "images",
+            "delete",
+            image_name,
+            f"--project={project}",
+            "--quiet",
+        ],
+        check=True,
+    )
+
+
+def dataflow_job_id_from_job_name(project: str, job_name: str) -> Optional[str]:
+    # Only return the 50 most recent results - our job is likely to be in here.
+    # If the job is not found, first try increasing this number.
+    # For more info see:
+    #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
+    jobs_request = (
+        dataflow.projects()
+        .jobs()
+        .list(
+            projectId=project,
+            filter="ACTIVE",
+            pageSize=50,
+        )
+    )
+    response = jobs_request.execute()
+
+    # Search for our job by name (names are unique); a missing "jobs" key means no jobs.
+    for job in response.get("jobs", []):
+        if job["name"] == job_name:
+            return job["id"]
+    return None
+
+
+@backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)  # retry on HttpError
+def dataflow_jobs_cancel(project: str, job_name: str) -> None:
+    # To cancel a dataflow job, we need its ID, not its name
+    job_id = dataflow_job_id_from_job_name(project, job_name)
+
+    if job_id is not None:
+        # Cancel the Dataflow job if it exists.
+        # If it doesn't, job_id will be equal to None.
+        # For more info, see:
+        #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
+        request = (
+            dataflow.projects()
+            .jobs()
+            .update(
+                projectId=project,
+                jobId=job_id,
+                body={"requestedState": "JOB_STATE_CANCELLED"},
+            )
+        )
+        request.execute()
+
+
+def dataflow_flex_template_build(
+    bucket_name: str, template_file: str, template_image: str, metadata_file: str
+) -> str:
+    subprocess.run(
+        [
+            "gcloud",
+            "dataflow",
+            "flex-template",
+            "build",
+            f"gs://{bucket_name}/{template_file}",
+            f"--image={template_image}",
+            "--sdk-language=PYTHON",
+            f"--metadata-file={metadata_file}",
+        ],
+        check=True,
+    )
+
+    yield f"gs://{bucket_name}/{template_file}"
+
+    storage_client = storage.Client()
+    storage_client.bucket(bucket_name).blob(template_file).delete()

From bfdd061a36e0ce77a69d88998aba05db3151e091 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 13 Apr 2021 15:36:31 -0700
Subject: [PATCH 02/57] update headers

---
 dataflow/flex-templates/streaming_beam/e2e_test.py       | 2 +-
 dataflow/flex-templates/streaming_beam/noxfile_config.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index ded698e5e30..1b78020fc69 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Google LLC
+# Copyright 2021 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the 'License');
 # you may not use this file except in compliance with the License.
diff --git a/dataflow/flex-templates/streaming_beam/noxfile_config.py b/dataflow/flex-templates/streaming_beam/noxfile_config.py
index b6ba946dcbc..79bccdd3e16 100644
--- a/dataflow/flex-templates/streaming_beam/noxfile_config.py
+++ b/dataflow/flex-templates/streaming_beam/noxfile_config.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Google LLC
+# Copyright 2021 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 32e50ed2174be7b9c4e76dd8ce0d5845208833dc Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 13 Apr 2021 15:42:22 -0700
Subject: [PATCH 03/57] move requirements-test.txt into flex-templates/streaming_beam

---
 .../{ => flex-templates/streaming_beam}/requirements-test.txt     | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename dataflow/{ => flex-templates/streaming_beam}/requirements-test.txt (100%)

diff --git a/dataflow/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt
similarity index 100%
rename from dataflow/requirements-test.txt
rename to dataflow/flex-templates/streaming_beam/requirements-test.txt

From 2dc5ba12d3ecfdb710535b0e1b2bb26cc90b0571 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 13 Apr 2021 15:57:21 -0700
Subject: [PATCH 04/57] enable relative import

---
 dataflow/flex-templates/__init__.py                | 13 +++++++++++++
 dataflow/flex-templates/streaming_beam/e2e_test.py |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 dataflow/flex-templates/__init__.py

diff --git a/dataflow/flex-templates/__init__.py b/dataflow/flex-templates/__init__.py
new file mode 100644
index 00000000000..e7ed7703169
--- /dev/null
+++ b/dataflow/flex-templates/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from .. import testing_utils
diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index 1b78020fc69..d043e52c0b8 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -18,7 +18,7 @@
 from google.cloud import bigquery
 import pytest
 
-from . import testing_utils
+from .. import testing_utils
 
 
 SUFFIX = uuid.uuid4().hex[0:6]

From 5c7a885d8814cbc0260df33126632e4f090427cd Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 13 Apr 2021 16:04:39 -0700
Subject: [PATCH 05/57] add dataflow/__init__.py

---
 dataflow/__init__.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 dataflow/__init__.py

diff --git a/dataflow/__init__.py b/dataflow/__init__.py
new file mode 100644
index 00000000000..ffc78f34e19
--- /dev/null
+++ b/dataflow/__init__.py
@@ -0,0 +1,11 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

From f6027ffeeb2235716894336c0d98ce13a6c52e19 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Wed, 14 Apr 2021 13:22:18 -0700
Subject: [PATCH 06/57] add flex-templates/streaming_beam/__init__.py

---
 dataflow/flex-templates/streaming_beam/__init__.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 dataflow/flex-templates/streaming_beam/__init__.py

diff --git a/dataflow/flex-templates/streaming_beam/__init__.py b/dataflow/flex-templates/streaming_beam/__init__.py
new file mode 100644
index 00000000000..ffc78f34e19
--- /dev/null
+++ b/dataflow/flex-templates/streaming_beam/__init__.py
@@ -0,0 +1,11 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

From 2c6c53ac450e28bb66b06881046171755b536f5a Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 27 Apr 2021 13:36:50 -0700
Subject: [PATCH 07/57] modularized tests with conftest

---
 dataflow/conftest.py                          | 292 ++++++++++++++++++
 .../flex-templates/streaming_beam/e2e_test.py | 105 +++----
 dataflow/testing_utils.py                     | 210 -------------
 3 files changed, 338 insertions(+), 269 deletions(-)
 create mode 100644 dataflow/conftest.py
 delete mode 100644 dataflow/testing_utils.py

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
new file mode 100644
index 00000000000..7995f922e1b
--- /dev/null
+++ b/dataflow/conftest.py
@@ -0,0 +1,292 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from dataclasses import dataclass
+from datetime import time
+import itertools
+import json
+import subprocess
+import multiprocessing as mp
+import os
+from typing import Callable, Dict, Optional
+import uuid
+
+import backoff
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+from google.cloud import bigquery
+from google.cloud.bigquery.table import RowIterator
+from google.cloud import pubsub
+from google.cloud import storage
+import pytest
+
+dataflow = build("dataflow", "v1b3")
+
+# Default options.
+PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"]
+REGION: str = "us-west1"
+ZONE: str = "us-west1-b"
+
+RETRY_MAX_TIME = 5 * 60  # 5 minutes in seconds
+
+
+@dataclass
+class Utils:
+    uuid: str = uuid.uuid4().hex[0:6]
+    project: str = PROJECT
+    region: str = REGION
+    zone: str = ZONE
+
+    @staticmethod
+    def storage_bucket(bucket_name: str) -> str:
+        storage_client = storage.Client()
+        bucket_unique_name = f"{bucket_name}-{Utils.uuid}"
+        bucket = storage_client.create_bucket(bucket_unique_name)
+
+        print(f"storage_bucket: {repr(bucket_unique_name)}")
+        yield bucket_unique_name
+
+        bucket.delete(force=True)
+
+    @staticmethod
+    def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str:
+        bigquery_client = bigquery.Client()
+        dataset = bigquery_client.create_dataset(
+            bigquery.Dataset(f"{project}.{dataset_name}_{Utils.uuid}")
+        )
+
+        print(f"bigquery_dataset: {dataset.full_dataset_id}")
+        yield dataset.full_dataset_id
+
+        bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True)
+
+    @staticmethod
+    def bigquery_query(query: str) -> RowIterator:
+        bigquery_client = bigquery.Client()
+        query_job = bigquery_client.query(query)
+        return query_job.result()
+
+    @staticmethod
+    def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
+        publisher_client = pubsub.PublisherClient()
+        topic_path = publisher_client.topic_path(project, f"{topic_name}-{Utils.uuid}")
+        topic = publisher_client.create_topic(topic_path)
+
+        print(f"pubsub_topic: {repr(topic.name)}")
+        yield topic.name
+
+        # Due to the pinned library dependencies in apache-beam, client
+        # library throws an error upon deletion.
+        # We use gcloud for a workaround. See also:
+        # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
+        subprocess.check_call(
+            ["gcloud", "pubsub", "--project", project, "topics", "delete", topic],
+            check=True,
+        )
+
+    @staticmethod
+    def pubsub_subscription(
+        topic_path: str, subscription_name: str, project: str = PROJECT
+    ) -> str:
+        subscriber = pubsub.SubscriberClient()
+        subscription_path = subscriber.subscription_path(
+            project, f"{subscription_name}-{Utils.uuid}"
+        )
+        subscription = subscriber.create_subscription(subscription_path, topic_path)
+
+        print(f"pubsub_subscription: {repr(subscription.name)}")
+        yield subscription.name
+
+        # Due to the pinned library dependencies in apache-beam, client
+        # library throws an error upon deletion.
+        # We use gcloud for a workaround. See also:
+        # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
+        subprocess.check_call(
+            [
+                "gcloud",
+                "pubsub",
+                "--project",
+                project,
+                "subscriptions",
+                "delete",
+                subscription_name,
+            ],
+            check=True,
+        )
+
+    @staticmethod
+    def pubsub_publisher(
+        topic_path: str,
+        new_msg: Callable[[int], str] = lambda i: json.dumps(
+            {"id": i, "content": f"message {i}"}
+        ),
+    ) -> bool:
+        def _infinite_publish_job() -> None:
+            publisher_client = pubsub.PublisherClient()
+            for i in itertools.count():
+                publisher_client.publish(
+                    topic_path, new_msg(i).encode("utf-8")
+                ).result()
+                time.sleep(1)
+
+        # Start a subprocess in the background to do the publishing.
+        p = mp.Process(target=_infinite_publish_job)
+        p.start()
+
+        yield p.is_alive()
+
+        # For cleanup, terminate the background process.
+        p.join(timeout=0)
+        p.terminate()
+
+    @staticmethod
+    def container_image(
+        image_path: str,
+        project: str = PROJECT,
+        tag: str = "latest",
+    ) -> str:
+        image_name = f"gcr.io/{project}/{image_path}-{Utils.uuid}:{tag}"
+        subprocess.run(["gcloud", "auth", "configure-docker"], check=True)
+        subprocess.run(
+            [
+                "gcloud",
+                "builds",
+                "submit",
+                f"--project={project}",
+                f"--tag={image_name}",
+                ".",
+            ],
+            check=True,
+        )
+
+        yield image_name
+
+        subprocess.run(
+            [
+                "gcloud",
+                "container",
+                "images",
+                "delete",
+                image_name,
+                f"--project={project}",
+                "--quiet",
+            ],
+            check=True,
+        )
+
+    @staticmethod
+    def dataflow_job_id_from_job_name(
+        job_name: str, project: str = PROJECT
+    ) -> Optional[str]:
+        # Only return the 50 most recent results - our job is likely to be in here.
+        # If the job is not found, first try increasing this number.
+        # For more info see:
+        #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
+        jobs_request = (
+            dataflow.projects()
+            .jobs()
+            .list(
+                projectId=project,
+                filter="ACTIVE",
+                pageSize=50,
+            )
+        )
+        response = jobs_request.execute()
+
+        # Search for the job in the list that has our name (names are unique)
+        for job in response["jobs"]:
+            if job["name"] == job_name:
+                return job["id"]
+        return None
+
+    @staticmethod
+    @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
+    def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None:
+        # To cancel a dataflow job, we need its ID, not its name
+        job_id = Utils.dataflow_job_id_from_job_name(project, job_name)
+
+        if job_id is not None:
+            # Cancel the Dataflow job if it exists.
+            # If it doesn't, job_id will be equal to None.
+            # For more info, see:
+            #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
+            request = (
+                dataflow.projects()
+                .jobs()
+                .update(
+                    projectId=project,
+                    jobId=job_id,
+                    body={"requestedState": "JOB_STATE_CANCELLED"},
+                )
+            )
+            request.execute()
+
+    @staticmethod
+    def dataflow_flex_template_build(
+        bucket_name: str,
+        template_image: str,
+        metadata_file: str,
+        project: str = PROJECT,
+        template_file: str = "template.json",
+    ) -> str:
+        subprocess.call(
+            [
+                "gcloud",
+                "dataflow",
+                "flex-template",
+                "build",
+                f"gs://{bucket_name}/{template_file}",
+                f"--project={project}",
+                f"--image={template_image}",
+                "--sdk-language=PYTHON",
+                f"--metadata-file={metadata_file}",
+            ],
+            check=True,
+        )
+
+        yield f"gs://{bucket_name}/{template_file}"
+
+        storage_client = storage.Client()
+        storage_client.bucket(bucket_name).blob(template_file).delete()
+
+    @staticmethod
+    def dataflow_flex_template_run(
+        job_name: str,
+        template_path: str,
+        bucket_name: str,
+        parameters: Dict[str, str] = {},
+        project: str = PROJECT,
+        region: str = REGION,
+    ) -> str:
+        unique_job_name = f"{job_name}-{Utils.uuid}"
+        subprocess.call(
+            [
+                "gcloud",
+                "dataflow",
+                "flex-template",
+                "run",
+                unique_job_name,
+                f"--template-file-gcs-location={template_path}",
+                f"--project={project}",
+                f"--region={region}",
+                f"--temp_location=gs://{bucket_name}/temp",
+            ]
+            + [f"--parameters={name}={value}" for name, value in parameters.items()],
+            check=True,
+        )
+
+        yield unique_job_name
+
+
+@pytest.fixture(scope="session")
+def utils():
+    return Utils()
diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index d043e52c0b8..1072765c351 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -10,103 +10,90 @@
 # distributed under the License is distributed on an 'AS IS' BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
-import os
-import subprocess
+import json
+from conftest import Utils
 import time
-import uuid
 
-from google.cloud import bigquery
 import pytest
 
-from .. import testing_utils
-
-
-SUFFIX = uuid.uuid4().hex[0:6]
-PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"]
-BUCKET_NAME = f"flex-templates-streaming-beam-{SUFFIX}"
-BIGQUERY_DATASET = f"flex_templates_{SUFFIX}"
-BIGQUERY_TABLE = "streaming_beam"
-TOPIC = f"flex-templates-streaming-beam-{SUFFIX}"
-SUBSCRIPTION = TOPIC
-IMAGE_NAME = f"gcr.io/{PROJECT}/dataflow/flex-templates/streaming-beam-{SUFFIX}:latest"
-TEMPLATE_FILE = "template.json"
-REGION = "us-central1"
-
 
 @pytest.fixture(scope="session")
-def bucket_name() -> str:
-    return testing_utils.storage_bucket(BUCKET_NAME)
+def bucket_name(utils: Utils) -> str:
+    return utils.storage_bucket("dataflow-flex-templates-streaming-beam")
 
 
 @pytest.fixture(scope="session")
-def topic_path() -> str:
-    return testing_utils.pubsub_topic(PROJECT, TOPIC)
+def pubsub_topic(utils: Utils) -> str:
+    return utils.pubsub_topic("dataflow-flex-templates-streaming-beam")
 
 
 @pytest.fixture(scope="session")
-def subscription_path(topic_path: str) -> str:
-    return testing_utils.pubsub_subscription(PROJECT, topic_path, SUBSCRIPTION)
+def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str:
+    return utils.pubsub_subscription(
+        pubsub_topic, "dataflow-flex-templates-streaming-beam"
+    )
 
 
 @pytest.fixture(scope="session")
-def bigquery_dataset() -> str:
-    return testing_utils.bigquery_dataset(PROJECT, BIGQUERY_DATASET)
+def bigquery_dataset(utils: Utils) -> str:
+    return utils.bigquery_dataset("dataflow_flex_templates")
 
 
 @pytest.fixture(scope="session")
-def publisher(topic_path: str) -> bool:
-    return testing_utils.pubsub_publisher(topic_path)
+def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool:
+    return utils.pubsub_publisher(
+        pubsub_topic,
+        new_msg=lambda i: json.dumps(
+            {
+                "url": "https://beam.apache.org/",
+                "review": "positive" if i % 2 == 0 else "negative",
+            }
+        ),
+    )
 
 
 @pytest.fixture(scope="session")
-def template_image() -> str:
-    return testing_utils.container_image(PROJECT, IMAGE_NAME)
+def flex_template_image(utils: Utils) -> str:
+    return utils.container_image(f"dataflow/flex-templates/streaming-beam")
 
 
 @pytest.fixture(scope="session")
-def template_path(bucket_name: str, template_image: str) -> str:
-    return testing_utils.dataflow_flex_template_build(
+def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str:
+    return utils.dataflow_flex_template_build(
         bucket_name=bucket_name,
-        template_file=TEMPLATE_FILE,
-        template_image=template_image,
+        template_image=flex_template_image,
         metadata_file="metadata.json",
     )
 
 
 def test_run_template(
-    publisher: str,
+    utils: Utils,
     bucket_name: str,
-    template_path: str,
-    dataset: str,
-    subscription_path: str,
+    pubsub_publisher: str,
+    pubsub_subscription: str,
+    flex_template_path: str,
+    bigquery_dataset: str,
 ) -> None:
 
-    job_name = f"flex-templates-streaming-beam-{SUFFIX}"
-    subprocess.call(
-        [
-            "gcloud",
-            "dataflow",
-            "flex-template",
-            "run",
-            job_name,
-            f"--template-file-gcs-location={template_path}",
-            f"--temp_location=gs://{bucket_name}/temp",
-            f"--parameters=input_subscription={subscription_path}",
-            f"--parameters=output_table={dataset}.{BIGQUERY_TABLE}",
-            f"--region={REGION}",
-        ],
-        check=True,
+    bigquery_table = "streaming_beam"
+    job_name = utils.dataflow_flex_template_run(
+        job_name="flex-templates-streaming-beam",
+        template_path=flex_template_path,
+        bucket_name=bucket_name,
+        parameters={
+            "input_subscription": pubsub_subscription,
+            "output_table": f"{bigquery_dataset}.{bigquery_table}",
+        },
     )
 
+    # Since this is a streaming job, it will never finish running.
     # Wait for 10 minutes, and then cancel the job.
     time.sleep(10 * 60)
-    testing_utils.dataflow_jobs_cancel(PROJECT, job_name)
+    utils.dataflow_jobs_cancel(job_name)
 
     # Check for output data in BigQuery.
-    bigquery_client = bigquery.Client()
-    query = f"SELECT * FROM {PROJECT}.{BIGQUERY_DATASET}.{BIGQUERY_TABLE}"
-    query_job = bigquery_client.query(query)
-    rows = query_job.result()
+    query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}"
+    rows = utils.bigquery_query(query)
     assert rows.total_rows > 0
     for row in rows:
-        assert row["score"] == 1
+        assert "score" in row
diff --git a/dataflow/testing_utils.py b/dataflow/testing_utils.py
deleted file mode 100644
index f0cedcec36e..00000000000
--- a/dataflow/testing_utils.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Copyright 2021 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-from datetime import time
-import itertools
-import subprocess
-import multiprocessing as mp
-from typing import Callable, List, Optional
-
-import backoff
-from googleapiclient.discovery import build
-from googleapiclient.errors import HttpError
-from google.cloud import bigquery
-from google.cloud import pubsub
-from google.cloud import storage
-
-dataflow = build("dataflow", "v1b3")
-
-
-RETRY_MAX_TIME = 5 * 60  # 5 minutes in seconds
-
-
-def storage_bucket(bucket_name: str) -> str:
-    storage_client = storage.Client()
-    bucket = storage_client.create_bucket(bucket_name)
-
-    print(f"storage_bucket: {repr(bucket_name)}")
-    yield bucket_name
-
-    bucket.delete(force=True)
-
-
-def bigquery_dataset(project: str, dataset_name: str) -> str:
-    bigquery_client = bigquery.Client()
-    dataset = bigquery.Dataset(f"{project}.{dataset_name}")
-    dataset = bigquery_client.create_dataset(dataset)
-
-    print(f"bigquery_dataset: {dataset.full_dataset_id}")
-    yield dataset.full_dataset_id
-
-    bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True)
-
-
-def pubsub_topic(project: str, topic_name: str) -> str:
-    publisher_client = pubsub.PublisherClient()
-    topic_path = publisher_client.topic_path(project, topic_name)
-    topic = publisher_client.create_topic(topic_path)
-
-    print(f"pubsub_topic: {repr(topic.name)}")
-    yield topic.name
-
-    # Due to the pinned library dependencies in apache-beam, client
-    # library throws an error upon deletion.
-    # We use gcloud for a workaround. See also:
-    # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-    subprocess.check_call(
-        ["gcloud", "pubsub", "--project", project, "topics", "delete", topic],
-        check=True,
-    )
-
-
-def pubsub_subscription(project: str, topic_path: str, subscription_name: str) -> str:
-    subscriber = pubsub.SubscriberClient()
-    subscription_path = subscriber.subscription_path(project, subscription_name)
-    subscription = subscriber.create_subscription(subscription_path, topic_path)
-
-    print(f"pubsub_subscription: {repr(subscription.name)}")
-    yield subscription.name
-
-    # Due to the pinned library dependencies in apache-beam, client
-    # library throws an error upon deletion.
-    # We use gcloud for a workaround. See also:
-    # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-    subprocess.check_call(
-        [
-            "gcloud",
-            "pubsub",
-            "--project",
-            project,
-            "subscriptions",
-            "delete",
-            subscription_name,
-        ],
-        check=True,
-    )
-
-
-def pubsub_publisher(topic_path: str, new_msg: Callable[[int], str]) -> bool:
-    def _infinite_publish_job() -> None:
-        publisher_client = pubsub.PublisherClient()
-        for i in itertools.count():
-            publisher_client.publish(topic_path, new_msg(i).encode("utf-8")).result()
-            time.sleep(1)
-
-    # Start a subprocess in the background to do the publishing.
-    p = mp.Process(target=_infinite_publish_job)
-    p.start()
-
-    yield p.is_alive()
-
-    # For cleanup, terminate the background process.
-    p.join(timeout=0)
-    p.terminate()
-
-
-def container_image(project: str, image_name: str) -> str:
-    subprocess.run(["gcloud", "auth", "configure-docker"], check=True)
-    subprocess.run(
-        [
-            "gcloud",
-            "builds",
-            "submit",
-            f"--project={project}",
-            f"--tag={image_name}",
-            ".",
-        ],
-        check=True,
-    )
-
-    yield image_name
-
-    subprocess.run(
-        [
-            "gcloud",
-            "container",
-            "images",
-            "delete",
-            image_name,
-            f"--project={project}",
-            "--quiet",
-        ],
-        check=True,
-    )
-
-
-def dataflow_job_id_from_job_name(project: str, job_name: str) -> Optional[str]:
-    # Only return the 50 most recent results - our job is likely to be in here.
-    # If the job is not found, first try increasing this number.
-    # For more info see:
-    #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
-    jobs_request = (
-        dataflow.projects()
-        .jobs()
-        .list(
-            projectId=project,
-            filter="ACTIVE",
-            pageSize=50,
-        )
-    )
-    response = jobs_request.execute()
-
-    # Search for the job in the list that has our name (names are unique)
-    for job in response["jobs"]:
-        if job["name"] == job_name:
-            return job["id"]
-    return None
-
-
-@backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
-def dataflow_jobs_cancel(project: str, job_name: str) -> None:
-    # To cancel a dataflow job, we need its ID, not its name
-    job_id = dataflow_job_id_from_job_name(project, job_name)
-
-    if job_id is not None:
-        # Cancel the Dataflow job if it exists.
-        # If it doesn't, job_id will be equal to None.
-        # For more info, see:
-        #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
-        request = (
-            dataflow.projects()
-            .jobs()
-            .update(
-                projectId=project,
-                jobId=job_id,
-                body={"requestedState": "JOB_STATE_CANCELLED"},
-            )
-        )
-        request.execute()
-
-
-def dataflow_flex_template_build(
-    bucket_name: str, template_file: str, template_image: str, metadata_file: str
-) -> str:
-    subprocess.call(
-        [
-            "gcloud",
-            "dataflow",
-            "flex-template",
-            "build",
-            f"gs://{bucket_name}/{template_file}",
-            f"--image={template_image}",
-            "--sdk-language=PYTHON",
-            f"--metadata-file={metadata_file}",
-        ],
-        check=True,
-    )
-
-    yield f"gs://{bucket_name}/{template_file}"
-
-    storage_client = storage.Client()
-    storage_client.bucket(bucket_name).blob(template_file).delete()

From 8417d63bed90835182c44e5ba913dab87a11fa86 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 27 Apr 2021 13:48:44 -0700
Subject: [PATCH 08/57] fix lint issues

---
 dataflow/conftest.py                          | 489 +++++++++---------
 dataflow/flex-templates/__init__.py           |   2 -
 .../flex-templates/streaming_beam/e2e_test.py |  36 +-
 dataflow/run_template/main_test.py            | 111 ++--
 4 files changed, 320 insertions(+), 318 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 7995f922e1b..8bbb4d59f31 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -10,283 +10,268 @@
 # distributed under the License is distributed on an 'AS IS' BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
-from dataclasses import dataclass
 from datetime import time
 import itertools
 import json
-import subprocess
 import multiprocessing as mp
 import os
+import subprocess
 from typing import Callable, Dict, Optional
 import uuid
 
 import backoff
-from googleapiclient.discovery import build
-from googleapiclient.errors import HttpError
 from google.cloud import bigquery
-from google.cloud.bigquery.table import RowIterator
 from google.cloud import pubsub
 from google.cloud import storage
-import pytest
+from google.cloud.bigquery.table import RowIterator
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
 
 dataflow = build("dataflow", "v1b3")
 
 # Default options.
+UUID = uuid.uuid4().hex[0:6]
 PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"]
-REGION: str = "us-west1"
-ZONE: str = "us-west1-b"
+REGION = "us-west1"
+ZONE = "us-west1-b"
 
 RETRY_MAX_TIME = 5 * 60  # 5 minutes in seconds
 
 
-@dataclass
-class Utils:
-    uuid: str = uuid.uuid4().hex[0:6]
-    project: str = PROJECT
-    region: str = REGION
-    zone: str = ZONE
-
-    @staticmethod
-    def storage_bucket(bucket_name: str) -> str:
-        storage_client = storage.Client()
-        bucket_unique_name = f"{bucket_name}-{Utils.uuid}"
-        bucket = storage_client.create_bucket(bucket_unique_name)
-
-        print(f"storage_bucket: {repr(bucket_unique_name)}")
-        yield bucket_unique_name
-
-        bucket.delete(force=True)
-
-    @staticmethod
-    def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str:
-        bigquery_client = bigquery.Client()
-        dataset = bigquery_client.create_dataset(
-            bigquery.Dataset(f"{project}.{dataset_name}_{Utils.uuid}")
-        )
-
-        print(f"bigquery_dataset: {dataset.full_dataset_id}")
-        yield dataset.full_dataset_id
-
-        bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True)
-
-    @staticmethod
-    def bigquery_query(query: str) -> RowIterator:
-        bigquery_client = bigquery.Client()
-        query_job = bigquery_client.query(query)
-        return query_job.result()
-
-    @staticmethod
-    def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
+def storage_bucket(bucket_name: str) -> str:
+    storage_client = storage.Client()
+    bucket_unique_name = f"{bucket_name}-{UUID}"
+    bucket = storage_client.create_bucket(bucket_unique_name)
+
+    print(f"storage_bucket: {repr(bucket_unique_name)}")
+    yield bucket_unique_name
+
+    bucket.delete(force=True)
+
+
+def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str:
+    bigquery_client = bigquery.Client()
+    dataset = bigquery_client.create_dataset(
+        bigquery.Dataset(f"{project}.{dataset_name}_{UUID}")
+    )
+
+    print(f"bigquery_dataset: {dataset.full_dataset_id}")
+    yield dataset.full_dataset_id
+
+    bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True)
+
+
+def bigquery_query(query: str) -> RowIterator:
+    bigquery_client = bigquery.Client()
+    query_job = bigquery_client.query(query)
+    return query_job.result()
+
+
+def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
+    publisher_client = pubsub.PublisherClient()
+    topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}")
+    topic = publisher_client.create_topic(topic_path)
+
+    print(f"pubsub_topic: {repr(topic.name)}")
+    yield topic.name
+
+    # Due to the pinned library dependencies in apache-beam, client
+    # library throws an error upon deletion.
+    # We use gcloud for a workaround. See also:
+    # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
+    subprocess.check_call(
+        ["gcloud", "pubsub", "--project", project, "topics", "delete", topic],
+        check=True,
+    )
+
+
+def pubsub_subscription(
+    topic_path: str, subscription_name: str, project: str = PROJECT
+) -> str:
+    subscriber = pubsub.SubscriberClient()
+    subscription_path = subscriber.subscription_path(
+        project, f"{subscription_name}-{UUID}"
+    )
+    subscription = subscriber.create_subscription(subscription_path, topic_path)
+
+    print(f"pubsub_subscription: {repr(subscription.name)}")
+    yield subscription.name
+
+    # Due to the pinned library dependencies in apache-beam, client
+    # library throws an error upon deletion.
+    # We use gcloud for a workaround. See also:
+    # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
+    subprocess.check_call(
+        [
+            "gcloud",
+            "pubsub",
+            "--project",
+            project,
+            "subscriptions",
+            "delete",
+            subscription_name,
+        ],
+        check=True,
+    )
+
+
+def pubsub_publisher(
+    topic_path: str,
+    new_msg: Callable[[int], str] = lambda i: json.dumps(
+        {"id": i, "content": f"message {i}"}
+    ),
+) -> bool:
+    def _infinite_publish_job() -> None:
         publisher_client = pubsub.PublisherClient()
-        topic_path = publisher_client.topic_path(project, f"{topic_name}-{Utils.uuid}")
-        topic = publisher_client.create_topic(topic_path)
-
-        print(f"pubsub_topic: {repr(topic.name)}")
-        yield topic.name
-
-        # Due to the pinned library dependencies in apache-beam, client
-        # library throws an error upon deletion.
-        # We use gcloud for a workaround. See also:
-        # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-        subprocess.check_call(
-            ["gcloud", "pubsub", "--project", project, "topics", "delete", topic],
-            check=True,
-        )
-
-    @staticmethod
-    def pubsub_subscription(
-        topic_path: str, subscription_name: str, project: str = PROJECT
-    ) -> str:
-        subscriber = pubsub.SubscriberClient()
-        subscription_path = subscriber.subscription_path(
-            project, f"{subscription_name}-{Utils.uuid}"
-        )
-        subscription = subscriber.create_subscription(subscription_path, topic_path)
-
-        print(f"pubsub_subscription: {repr(subscription.name)}")
-        yield subscription.name
-
-        # Due to the pinned library dependencies in apache-beam, client
-        # library throws an error upon deletion.
-        # We use gcloud for a workaround. See also:
-        # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-        subprocess.check_call(
-            [
-                "gcloud",
-                "pubsub",
-                "--project",
-                project,
-                "subscriptions",
-                "delete",
-                subscription_name,
-            ],
-            check=True,
-        )
-
-    @staticmethod
-    def pubsub_publisher(
-        topic_path: str,
-        new_msg: Callable[[int], str] = lambda i: json.dumps(
-            {"id": i, "content": f"message {i}"}
-        ),
-    ) -> bool:
-        def _infinite_publish_job() -> None:
-            publisher_client = pubsub.PublisherClient()
-            for i in itertools.count():
-                publisher_client.publish(
-                    topic_path, new_msg(i).encode("utf-8")
-                ).result()
-                time.sleep(1)
-
-        # Start a subprocess in the background to do the publishing.
-        p = mp.Process(target=_infinite_publish_job)
-        p.start()
-
-        yield p.is_alive()
-
-        # For cleanup, terminate the background process.
-        p.join(timeout=0)
-        p.terminate()
-
-    @staticmethod
-    def container_image(
-        image_path: str,
-        project: str = PROJECT,
-        tag: str = "latest",
-    ) -> str:
-        image_name = f"gcr.io/{project}/{image_path}-{Utils.uuid}:{tag}"
-        subprocess.run(["gcloud", "auth", "configure-docker"], check=True)
-        subprocess.run(
-            [
-                "gcloud",
-                "builds",
-                "submit",
-                f"--project={project}",
-                f"--tag={image_name}",
-                ".",
-            ],
-            check=True,
-        )
-
-        yield image_name
-
-        subprocess.run(
-            [
-                "gcloud",
-                "container",
-                "images",
-                "delete",
-                image_name,
-                f"--project={project}",
-                "--quiet",
-            ],
-            check=True,
+        for i in itertools.count():
+            publisher_client.publish(topic_path, new_msg(i).encode("utf-8")).result()
+            time.sleep(1)
+
+    # Start a subprocess in the background to do the publishing.
+    p = mp.Process(target=_infinite_publish_job)
+    p.start()
+
+    yield p.is_alive()
+
+    # For cleanup, terminate the background process.
+    p.join(timeout=0)
+    p.terminate()
+
+
+def container_image(
+    image_path: str,
+    project: str = PROJECT,
+    tag: str = "latest",
+) -> str:
+    image_name = f"gcr.io/{project}/{image_path}-{UUID}:{tag}"
+    subprocess.run(["gcloud", "auth", "configure-docker"], check=True)
+    subprocess.run(
+        [
+            "gcloud",
+            "builds",
+            "submit",
+            f"--project={project}",
+            f"--tag={image_name}",
+            ".",
+        ],
+        check=True,
+    )
+
+    yield image_name
+
+    subprocess.run(
+        [
+            "gcloud",
+            "container",
+            "images",
+            "delete",
+            image_name,
+            f"--project={project}",
+            "--quiet",
+        ],
+        check=True,
+    )
+
+
+def dataflow_job_id_from_job_name(
+    job_name: str, project: str = PROJECT
+) -> Optional[str]:
+    # Only return the 50 most recent results - our job is likely to be in here.
+    # If the job is not found, first try increasing this number.
+    # For more info see:
+    #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
+    jobs_request = (
+        dataflow.projects()
+        .jobs()
+        .list(
+            projectId=project,
+            filter="ACTIVE",
+            pageSize=50,
         )
-
-    @staticmethod
-    def dataflow_job_id_from_job_name(
-        job_name: str, project: str = PROJECT
-    ) -> Optional[str]:
-        # Only return the 50 most recent results - our job is likely to be in here.
-        # If the job is not found, first try increasing this number.
-        # For more info see:
-        #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
-        jobs_request = (
+    )
+    response = jobs_request.execute()
+
+    # Search for the job in the list that has our name (names are unique)
+    for job in response["jobs"]:
+        if job["name"] == job_name:
+            return job["id"]
+    return None
+
+
+@backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
+def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None:
+    # To cancel a dataflow job, we need its ID, not its name
+    job_id = dataflow_job_id_from_job_name(project, job_name)
+
+    if job_id is not None:
+        # Cancel the Dataflow job if it exists.
+        # If it doesn't, job_id will be equal to None.
+        # For more info, see:
+        #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
+        request = (
             dataflow.projects()
             .jobs()
-            .list(
+            .update(
                 projectId=project,
-                filter="ACTIVE",
-                pageSize=50,
-            )
-        )
-        response = jobs_request.execute()
-
-        # Search for the job in the list that has our name (names are unique)
-        for job in response["jobs"]:
-            if job["name"] == job_name:
-                return job["id"]
-        return None
-
-    @staticmethod
-    @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
-    def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None:
-        # To cancel a dataflow job, we need its ID, not its name
-        job_id = Utils.dataflow_job_id_from_job_name(project, job_name)
-
-        if job_id is not None:
-            # Cancel the Dataflow job if it exists.
-            # If it doesn't, job_id will be equal to None.
-            # For more info, see:
-            #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
-            request = (
-                dataflow.projects()
-                .jobs()
-                .update(
-                    projectId=project,
-                    jobId=job_id,
-                    body={"requestedState": "JOB_STATE_CANCELLED"},
-                )
+                jobId=job_id,
+                body={"requestedState": "JOB_STATE_CANCELLED"},
             )
-            request.execute()
-
-    @staticmethod
-    def dataflow_flex_template_build(
-        bucket_name: str,
-        template_image: str,
-        metadata_file: str,
-        project: str = PROJECT,
-        template_file: str = "template.json",
-    ) -> str:
-        subprocess.call(
-            [
-                "gcloud",
-                "dataflow",
-                "flex-template",
-                "build",
-                f"gs://{bucket_name}/{template_file}",
-                f"--project={project}",
-                f"--image={template_image}",
-                "--sdk-language=PYTHON",
-                f"--metadata-file={metadata_file}",
-            ],
-            check=True,
-        )
-
-        yield f"gs://{bucket_name}/{template_file}"
-
-        storage_client = storage.Client()
-        storage_client.bucket(bucket_name).blob(template_file).delete()
-
-    @staticmethod
-    def dataflow_flex_template_run(
-        job_name: str,
-        template_path: str,
-        bucket_name: str,
-        parameters: Dict[str, str] = {},
-        project: str = PROJECT,
-        region: str = REGION,
-    ) -> str:
-        unique_job_name = f"{job_name}-{Utils.uuid}"
-        subprocess.call(
-            [
-                "gcloud",
-                "dataflow",
-                "flex-template",
-                "run",
-                unique_job_name,
-                f"--template-file-gcs-location={template_path}",
-                f"--project={project}",
-                f"--region={region}",
-                f"--temp_location=gs://{bucket_name}/temp",
-            ]
-            + [f"--parameters={name}={value}" for name, value in parameters.items()],
-            check=True,
         )
-
-        yield unique_job_name
-
-
-@pytest.fixture(scope="session")
-def utils():
-    return Utils()
+        request.execute()
+
+
+@staticmethod
+def dataflow_flex_template_build(
+    bucket_name: str,
+    template_image: str,
+    metadata_file: str,
+    project: str = PROJECT,
+    template_file: str = "template.json",
+) -> str:
+    subprocess.call(
+        [
+            "gcloud",
+            "dataflow",
+            "flex-template",
+            "build",
+            f"gs://{bucket_name}/{template_file}",
+            f"--project={project}",
+            f"--image={template_image}",
+            "--sdk-language=PYTHON",
+            f"--metadata-file={metadata_file}",
+        ],
+        check=True,
+    )
+
+    yield f"gs://{bucket_name}/{template_file}"
+
+    storage_client = storage.Client()
+    storage_client.bucket(bucket_name).blob(template_file).delete()
+
+
+def dataflow_flex_template_run(
+    job_name: str,
+    template_path: str,
+    bucket_name: str,
+    parameters: Dict[str, str] = {},
+    project: str = PROJECT,
+    region: str = REGION,
+) -> str:
+    unique_job_name = f"{job_name}-{UUID}"
+    subprocess.call(
+        [
+            "gcloud",
+            "dataflow",
+            "flex-template",
+            "run",
+            unique_job_name,
+            f"--template-file-gcs-location={template_path}",
+            f"--project={project}",
+            f"--region={region}",
+            f"--temp_location=gs://{bucket_name}/temp",
+        ]
+        + [f"--parameters={name}={value}" for name, value in parameters.items()],
+        check=True,
+    )
+
+    yield unique_job_name
diff --git a/dataflow/flex-templates/__init__.py b/dataflow/flex-templates/__init__.py
index e7ed7703169..ffc78f34e19 100644
--- a/dataflow/flex-templates/__init__.py
+++ b/dataflow/flex-templates/__init__.py
@@ -9,5 +9,3 @@
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an 'AS IS' BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-from .. import testing_utils
diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index 1072765c351..b366ff90b40 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -11,36 +11,37 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
 import json
-from conftest import Utils
 import time
 
 import pytest
 
+import conftest as utils
+
+NAME = "dataflow-flex-templates-streaming-beam"
+
 
 @pytest.fixture(scope="session")
-def bucket_name(utils: Utils) -> str:
-    return utils.storage_bucket("dataflow-flex-templates-streaming-beam")
+def bucket_name() -> str:
+    return utils.storage_bucket(NAME)
 
 
 @pytest.fixture(scope="session")
-def pubsub_topic(utils: Utils) -> str:
-    return utils.pubsub_topic("dataflow-flex-templates-streaming-beam")
+def pubsub_topic() -> str:
+    return utils.pubsub_topic(NAME)
 
 
 @pytest.fixture(scope="session")
-def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str:
-    return utils.pubsub_subscription(
-        pubsub_topic, "dataflow-flex-templates-streaming-beam"
-    )
+def pubsub_subscription(pubsub_topic: str) -> str:
+    return utils.pubsub_subscription(pubsub_topic, NAME)
 
 
 @pytest.fixture(scope="session")
-def bigquery_dataset(utils: Utils) -> str:
-    return utils.bigquery_dataset("dataflow_flex_templates")
+def bigquery_dataset() -> str:
+    return utils.bigquery_dataset(NAME.replace("-", "_"))
 
 
 @pytest.fixture(scope="session")
-def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool:
+def pubsub_publisher(pubsub_topic: str) -> bool:
     return utils.pubsub_publisher(
         pubsub_topic,
         new_msg=lambda i: json.dumps(
@@ -53,12 +54,12 @@ def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool:
 
 
 @pytest.fixture(scope="session")
-def flex_template_image(utils: Utils) -> str:
-    return utils.container_image(f"dataflow/flex-templates/streaming-beam")
+def flex_template_image() -> str:
+    return utils.container_image(NAME)
 
 
 @pytest.fixture(scope="session")
-def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str:
+def flex_template_path(bucket_name: str, flex_template_image: str) -> str:
     return utils.dataflow_flex_template_build(
         bucket_name=bucket_name,
         template_image=flex_template_image,
@@ -67,7 +68,6 @@ def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str)
 
 
 def test_run_template(
-    utils: Utils,
     bucket_name: str,
     pubsub_publisher: str,
     pubsub_subscription: str,
@@ -75,9 +75,9 @@ def test_run_template(
     bigquery_dataset: str,
 ) -> None:
 
-    bigquery_table = "streaming_beam"
+    bigquery_table = "output_table"
     job_name = utils.dataflow_flex_template_run(
-        job_name="flex-templates-streaming-beam",
+        job_name=NAME,
         template_path=flex_template_path,
         bucket_name=bucket_name,
         parameters={
diff --git a/dataflow/run_template/main_test.py b/dataflow/run_template/main_test.py
index 6a5b9792692..1c20aeed1b4 100644
--- a/dataflow/run_template/main_test.py
+++ b/dataflow/run_template/main_test.py
@@ -31,15 +31,15 @@
 
 from werkzeug.urls import url_encode
 
-import main
+from . import main
 
 
 RETRY_MAX_TIME = 5 * 60  # 5 minutes in seconds
 
-PROJECT = os.environ['GOOGLE_CLOUD_PROJECT']
-BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
+PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"]
+BUCKET = os.environ["CLOUD_STORAGE_BUCKET"]
 
-dataflow = build('dataflow', 'v1b3')
+dataflow = build("dataflow", "v1b3")
 
 
 # Create a fake "app" for generating test request contexts.
@@ -53,8 +53,9 @@ def app():
 @pytest.fixture(scope="function")
 def dataflow_job_name(request):
     label = request.param
-    job_name = datetime.now().strftime('{}-%Y%m%d-%H%M%S-{}'.format(
-        label, uuid.uuid4().hex[:5]))
+    job_name = datetime.now().strftime(
+        "{}-%Y%m%d-%H%M%S-{}".format(label, uuid.uuid4().hex[:5])
+    )
 
     yield job_name
 
@@ -69,17 +70,21 @@ def dataflow_job_name(request):
 # Takes in a Dataflow job name and returns its job ID
 def get_job_id_from_name(job_name):
     # list the 50 most recent Dataflow jobs
-    jobs_request = dataflow.projects().jobs().list(
-        projectId=PROJECT,
-        filter="ACTIVE",
-        pageSize=50  # only return the 50 most recent results - our job is likely to be in here. If the job is not found, first try increasing this number. For more info see:https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
+    jobs_request = (
+        dataflow.projects()
+        .jobs()
+        .list(
+            projectId=PROJECT,
+            filter="ACTIVE",
+            pageSize=50,  # only return the 50 most recent results - our job is likely to be in here. If the job is not found, first try increasing this number. For more info see:https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
+        )
     )
     response = jobs_request.execute()
 
     # search for the job in the list that has our name (names are unique)
-    for job in response['jobs']:
-        if job['name'] == job_name:
-            return job['id']
+    for job in response["jobs"]:
+        if job["name"] == job_name:
+            return job["id"]
     # if we don't find a job, just return
     return
 
@@ -92,32 +97,40 @@ def dataflow_jobs_cancel(job_name):
 
     if job_id:
         # Cancel the Dataflow job if it exists. If it doesn't, job_id will be equal to None. For more info, see: https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
-        request = dataflow.projects().jobs().update(
-            projectId=PROJECT,
-            jobId=job_id,
-            body={'requestedState': 'JOB_STATE_CANCELLED'}
+        request = (
+            dataflow.projects()
+            .jobs()
+            .update(
+                projectId=PROJECT,
+                jobId=job_id,
+                body={"requestedState": "JOB_STATE_CANCELLED"},
+            )
         )
         request.execute()
 
 
-@pytest.mark.parametrize('dataflow_job_name', [('test_run_template_empty')], indirect=True)
+@pytest.mark.parametrize(
+    "dataflow_job_name", [("test_run_template_empty")], indirect=True
+)
 def test_run_template_python_empty_args(app, dataflow_job_name):
     project = PROJECT
-    template = 'gs://dataflow-templates/latest/Word_Count'
+    template = "gs://dataflow-templates/latest/Word_Count"
     with pytest.raises(HttpError):
         main.run(project, dataflow_job_name, template)
 
 
-@pytest.mark.parametrize('dataflow_job_name', [('test_run_template_python')], indirect=True)
+@pytest.mark.parametrize(
+    "dataflow_job_name", [("test_run_template_python")], indirect=True
+)
 def test_run_template_python(app, dataflow_job_name):
     project = PROJECT
-    template = 'gs://dataflow-templates/latest/Word_Count'
+    template = "gs://dataflow-templates/latest/Word_Count"
     parameters = {
-        'inputFile': 'gs://apache-beam-samples/shakespeare/kinglear.txt',
-        'output': 'gs://{}/dataflow/wordcount/outputs'.format(BUCKET),
+        "inputFile": "gs://apache-beam-samples/shakespeare/kinglear.txt",
+        "output": "gs://{}/dataflow/wordcount/outputs".format(BUCKET),
     }
     res = main.run(project, dataflow_job_name, template, parameters)
-    assert 'test_run_template_python' in res['job']['name']
+    assert "test_run_template_python" in res["job"]["name"]
 
 
 def test_run_template_http_empty_args(app):
@@ -126,46 +139,52 @@ def test_run_template_http_empty_args(app):
             main.run_template(flask.request)
 
 
-@pytest.mark.parametrize('dataflow_job_name', [('test_run_template_url')], indirect=True)
+@pytest.mark.parametrize(
+    "dataflow_job_name", [("test_run_template_url")], indirect=True
+)
 def test_run_template_http_url(app, dataflow_job_name):
     args = {
-        'project': PROJECT,
-        'job': dataflow_job_name,
-        'template': 'gs://dataflow-templates/latest/Word_Count',
-        'inputFile': 'gs://apache-beam-samples/shakespeare/kinglear.txt',
-        'output': 'gs://{}/dataflow/wordcount/outputs'.format(BUCKET),
+        "project": PROJECT,
+        "job": dataflow_job_name,
+        "template": "gs://dataflow-templates/latest/Word_Count",
+        "inputFile": "gs://apache-beam-samples/shakespeare/kinglear.txt",
+        "output": "gs://{}/dataflow/wordcount/outputs".format(BUCKET),
     }
-    with app.test_request_context('/?' + url_encode(args)):
+    with app.test_request_context("/?" + url_encode(args)):
         res = main.run_template(flask.request)
         data = json.loads(res)
-        assert 'test_run_template_url' in data['job']['name']
+        assert "test_run_template_url" in data["job"]["name"]
 
 
-@pytest.mark.parametrize('dataflow_job_name', [('test_run_template_data')], indirect=True)
+@pytest.mark.parametrize(
+    "dataflow_job_name", [("test_run_template_data")], indirect=True
+)
 def test_run_template_http_data(app, dataflow_job_name):
     args = {
-        'project': PROJECT,
-        'job': dataflow_job_name,
-        'template': 'gs://dataflow-templates/latest/Word_Count',
-        'inputFile': 'gs://apache-beam-samples/shakespeare/kinglear.txt',
-        'output': 'gs://{}/dataflow/wordcount/outputs'.format(BUCKET),
+        "project": PROJECT,
+        "job": dataflow_job_name,
+        "template": "gs://dataflow-templates/latest/Word_Count",
+        "inputFile": "gs://apache-beam-samples/shakespeare/kinglear.txt",
+        "output": "gs://{}/dataflow/wordcount/outputs".format(BUCKET),
     }
     with app.test_request_context(data=args):
         res = main.run_template(flask.request)
         data = json.loads(res)
-        assert 'test_run_template_data' in data['job']['name']
+        assert "test_run_template_data" in data["job"]["name"]
 
 
-@pytest.mark.parametrize('dataflow_job_name', [('test_run_template_json')], indirect=True)
+@pytest.mark.parametrize(
+    "dataflow_job_name", [("test_run_template_json")], indirect=True
+)
 def test_run_template_http_json(app, dataflow_job_name):
     args = {
-        'project': PROJECT,
-        'job': dataflow_job_name,
-        'template': 'gs://dataflow-templates/latest/Word_Count',
-        'inputFile': 'gs://apache-beam-samples/shakespeare/kinglear.txt',
-        'output': 'gs://{}/dataflow/wordcount/outputs'.format(BUCKET),
+        "project": PROJECT,
+        "job": dataflow_job_name,
+        "template": "gs://dataflow-templates/latest/Word_Count",
+        "inputFile": "gs://apache-beam-samples/shakespeare/kinglear.txt",
+        "output": "gs://{}/dataflow/wordcount/outputs".format(BUCKET),
     }
     with app.test_request_context(json=args):
         res = main.run_template(flask.request)
         data = json.loads(res)
-        assert 'test_run_template_json' in data['job']['name']
+        assert "test_run_template_json" in data["job"]["name"]

From e31334c5a517b384e1477f2ccc1cc57247546ec3 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 27 Apr 2021 14:03:21 -0700
Subject: [PATCH 09/57] fix import order

---
 dataflow/flex-templates/streaming_beam/e2e_test.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index b366ff90b40..ebbb7f9a568 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -13,9 +13,8 @@
 import json
 import time
 
-import pytest
-
 import conftest as utils
+import pytest
 
 NAME = "dataflow-flex-templates-streaming-beam"
 

From b157d5443d34fd9903e3612ee9ee7944ce9b9616 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 27 Apr 2021 14:07:26 -0700
Subject: [PATCH 10/57] add google-cloud-storage

---
 dataflow/flex-templates/streaming_beam/requirements-test.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt
index 16def8b7d52..0cee7f38e5a 100644
--- a/dataflow/flex-templates/streaming_beam/requirements-test.txt
+++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt
@@ -1,4 +1,5 @@
 backoff==1.10.0
 google-api-python-client==2.1.0
+google-cloud-storage==1.38.0
 pytest-xdist==2.2.1
-pytest==6.2.1
+pytest==6.2.1
\ No newline at end of file

From 48232b60acb9de7ee3647d4e4810a1e754caeb12 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 27 Apr 2021 14:36:44 -0700
Subject: [PATCH 11/57] make external library imports local

---
 dataflow/conftest.py | 80 +++++++++++++++++++++++++++-----------------
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 8bbb4d59f31..2de0a6e825a 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -16,18 +16,9 @@
 import multiprocessing as mp
 import os
 import subprocess
-from typing import Callable, Dict, Optional
+from typing import Any, Callable, Dict, Iterable, Optional
 import uuid
 
-import backoff
-from google.cloud import bigquery
-from google.cloud import pubsub
-from google.cloud import storage
-from google.cloud.bigquery.table import RowIterator
-from googleapiclient.discovery import build
-from googleapiclient.errors import HttpError
-
-dataflow = build("dataflow", "v1b3")
 
 # Default options.
 UUID = uuid.uuid4().hex[0:6]
@@ -39,6 +30,8 @@
 
 
 def storage_bucket(bucket_name: str) -> str:
+    from google.cloud import storage
+
     storage_client = storage.Client()
     bucket_unique_name = f"{bucket_name}-{UUID}"
     bucket = storage_client.create_bucket(bucket_unique_name)
@@ -50,6 +43,8 @@ def storage_bucket(bucket_name: str) -> str:
 
 
 def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str:
+    from google.cloud import bigquery
+
     bigquery_client = bigquery.Client()
     dataset = bigquery_client.create_dataset(
         bigquery.Dataset(f"{project}.{dataset_name}_{UUID}")
@@ -61,13 +56,17 @@ def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str:
     bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True)
 
 
-def bigquery_query(query: str) -> RowIterator:
+def bigquery_query(query: str) -> Iterable[Dict[str, Any]]:
+    from google.cloud import bigquery
+
     bigquery_client = bigquery.Client()
-    query_job = bigquery_client.query(query)
-    return query_job.result()
+    for row in bigquery_client.query(query):
+        yield dict(row)
 
 
 def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
+    from google.cloud import pubsub
+
     publisher_client = pubsub.PublisherClient()
     topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}")
     topic = publisher_client.create_topic(topic_path)
@@ -88,6 +87,8 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
 def pubsub_subscription(
     topic_path: str, subscription_name: str, project: str = PROJECT
 ) -> str:
+    from google.cloud import pubsub
+
     subscriber = pubsub.SubscriberClient()
     subscription_path = subscriber.subscription_path(
         project, f"{subscription_name}-{UUID}"
@@ -121,6 +122,8 @@ def pubsub_publisher(
         {"id": i, "content": f"message {i}"}
     ),
 ) -> bool:
+    from google.cloud import pubsub
+
     def _infinite_publish_job() -> None:
         publisher_client = pubsub.PublisherClient()
         for i in itertools.count():
@@ -176,6 +179,10 @@ def container_image(
 def dataflow_job_id_from_job_name(
     job_name: str, project: str = PROJECT
 ) -> Optional[str]:
+    from googleapiclient.discovery import build
+
+    dataflow = build("dataflow", "v1b3")
+
     # Only return the 50 most recent results - our job is likely to be in here.
     # If the job is not found, first try increasing this number.
     # For more info see:
@@ -198,26 +205,35 @@ def dataflow_job_id_from_job_name(
     return None
 
 
-@backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
 def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None:
-    # To cancel a dataflow job, we need its ID, not its name
-    job_id = dataflow_job_id_from_job_name(project, job_name)
-
-    if job_id is not None:
-        # Cancel the Dataflow job if it exists.
-        # If it doesn't, job_id will be equal to None.
-        # For more info, see:
-        #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
-        request = (
-            dataflow.projects()
-            .jobs()
-            .update(
-                projectId=project,
-                jobId=job_id,
-                body={"requestedState": "JOB_STATE_CANCELLED"},
+    import backoff
+    from googleapiclient.discovery import build
+    from googleapiclient.errors import HttpError
+
+    dataflow = build("dataflow", "v1b3")
+
+    @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
+    def cancel():
+        # To cancel a dataflow job, we need its ID, not its name
+        job_id = dataflow_job_id_from_job_name(project, job_name)
+
+        if job_id is not None:
+            # Cancel the Dataflow job if it exists.
+            # If it doesn't, job_id will be equal to None.
+            # For more info, see:
+            #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
+            request = (
+                dataflow.projects()
+                .jobs()
+                .update(
+                    projectId=project,
+                    jobId=job_id,
+                    body={"requestedState": "JOB_STATE_CANCELLED"},
+                )
             )
-        )
-        request.execute()
+            request.execute()
+
+    cancel()
 
 
 @staticmethod
@@ -228,6 +244,8 @@ def dataflow_flex_template_build(
     project: str = PROJECT,
     template_file: str = "template.json",
 ) -> str:
+    from google.cloud import storage
+
     subprocess.call(
         [
             "gcloud",

From b3921af78f9e42fd558f6ba95d5fd5dd5264f44a Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 27 Apr 2021 14:36:57 -0700
Subject: [PATCH 12/57] update checks

---
 dataflow/flex-templates/streaming_beam/e2e_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index ebbb7f9a568..60683a1bfcf 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -92,7 +92,7 @@ def test_run_template(
 
     # Check for output data in BigQuery.
     query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}"
-    rows = utils.bigquery_query(query)
-    assert rows.total_rows > 0
+    rows = list(utils.bigquery_query(query))
+    assert len(rows) > 0
     for row in rows:
         assert "score" in row

From 1e1b0c0c24c8c7a748df9cf578614d1e8ffa6d8d Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 27 Apr 2021 14:44:03 -0700
Subject: [PATCH 13/57] access through fixture

---
 dataflow/conftest.py                          | 505 +++++++++---------
 .../flex-templates/streaming_beam/e2e_test.py |  20 +-
 2 files changed, 273 insertions(+), 252 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 2de0a6e825a..a7e6f713419 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -10,6 +10,7 @@
 # distributed under the License is distributed on an 'AS IS' BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
+from dataclasses import dataclass
 from datetime import time
 import itertools
 import json
@@ -19,6 +20,7 @@
 from typing import Any, Callable, Dict, Iterable, Optional
 import uuid
 
+import pytest
 
 # Default options.
 UUID = uuid.uuid4().hex[0:6]
@@ -29,267 +31,282 @@
 RETRY_MAX_TIME = 5 * 60  # 5 minutes in seconds
 
 
-def storage_bucket(bucket_name: str) -> str:
-    from google.cloud import storage
+class Utils:
+    uuid: str = UUID
+    project: str = PROJECT
+    region: str = REGION
+    zone: str = ZONE
 
-    storage_client = storage.Client()
-    bucket_unique_name = f"{bucket_name}-{UUID}"
-    bucket = storage_client.create_bucket(bucket_unique_name)
+    @staticmethod
+    def storage_bucket(bucket_name: str) -> str:
+        from google.cloud import storage
 
-    print(f"storage_bucket: {repr(bucket_unique_name)}")
-    yield bucket_unique_name
+        storage_client = storage.Client()
+        bucket_unique_name = f"{bucket_name}-{UUID}"
+        bucket = storage_client.create_bucket(bucket_unique_name)
 
-    bucket.delete(force=True)
+        print(f"storage_bucket: {repr(bucket_unique_name)}")
+        yield bucket_unique_name
 
+        bucket.delete(force=True)
 
-def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str:
-    from google.cloud import bigquery
+    @staticmethod
+    def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str:
+        from google.cloud import bigquery
 
-    bigquery_client = bigquery.Client()
-    dataset = bigquery_client.create_dataset(
-        bigquery.Dataset(f"{project}.{dataset_name}_{UUID}")
-    )
-
-    print(f"bigquery_dataset: {dataset.full_dataset_id}")
-    yield dataset.full_dataset_id
-
-    bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True)
-
-
-def bigquery_query(query: str) -> Iterable[Dict[str, Any]]:
-    from google.cloud import bigquery
+        bigquery_client = bigquery.Client()
+        dataset = bigquery_client.create_dataset(
+            bigquery.Dataset(f"{project}.{dataset_name}_{UUID}")
+        )
 
-    bigquery_client = bigquery.Client()
-    for row in bigquery_client.query(query):
-        yield dict(row)
+        print(f"bigquery_dataset: {dataset.full_dataset_id}")
+        yield dataset.full_dataset_id
 
+        bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True)
 
-def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
-    from google.cloud import pubsub
+    @staticmethod
+    def bigquery_query(query: str) -> Iterable[Dict[str, Any]]:
+        from google.cloud import bigquery
 
-    publisher_client = pubsub.PublisherClient()
-    topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}")
-    topic = publisher_client.create_topic(topic_path)
+        bigquery_client = bigquery.Client()
+        for row in bigquery_client.query(query):
+            yield dict(row)
 
-    print(f"pubsub_topic: {repr(topic.name)}")
-    yield topic.name
+    @staticmethod
+    def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
+        from google.cloud import pubsub
 
-    # Due to the pinned library dependencies in apache-beam, client
-    # library throws an error upon deletion.
-    # We use gcloud for a workaround. See also:
-    # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-    subprocess.check_call(
-        ["gcloud", "pubsub", "--project", project, "topics", "delete", topic],
-        check=True,
-    )
+        publisher_client = pubsub.PublisherClient()
+        topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}")
+        topic = publisher_client.create_topic(topic_path)
+
+        print(f"pubsub_topic: {repr(topic.name)}")
+        yield topic.name
+
+        # Due to the pinned library dependencies in apache-beam, client
+        # library throws an error upon deletion.
+        # We use gcloud for a workaround. See also:
+        # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
+        subprocess.check_call(
+            ["gcloud", "pubsub", "--project", project, "topics", "delete", topic],
+            check=True,
+        )
 
+    @staticmethod
+    def pubsub_subscription(
+        topic_path: str,
+        subscription_name: str,
+        project: str = PROJECT,
+    ) -> str:
+        from google.cloud import pubsub
+
+        subscriber = pubsub.SubscriberClient()
+        subscription_path = subscriber.subscription_path(
+            project, f"{subscription_name}-{UUID}"
+        )
+        subscription = subscriber.create_subscription(subscription_path, topic_path)
+
+        print(f"pubsub_subscription: {repr(subscription.name)}")
+        yield subscription.name
+
+        # Due to the pinned library dependencies in apache-beam, client
+        # library throws an error upon deletion.
+        # We use gcloud for a workaround. See also:
+        # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
+        subprocess.check_call(
+            [
+                "gcloud",
+                "pubsub",
+                "--project",
+                project,
+                "subscriptions",
+                "delete",
+                subscription_name,
+            ],
+            check=True,
+        )
 
-def pubsub_subscription(
-    topic_path: str, subscription_name: str, project: str = PROJECT
-) -> str:
-    from google.cloud import pubsub
+    @staticmethod
+    def pubsub_publisher(
+        topic_path: str,
+        new_msg: Callable[[int], str] = lambda i: json.dumps(
+            {"id": i, "content": f"message {i}"}
+        ),
+    ) -> bool:
+        from google.cloud import pubsub
+
+        def _infinite_publish_job() -> None:
+            publisher_client = pubsub.PublisherClient()
+            for i in itertools.count():
+                publisher_client.publish(
+                    topic_path, new_msg(i).encode("utf-8")
+                ).result()
+                time.sleep(1)
+
+        # Start a subprocess in the background to do the publishing.
+        p = mp.Process(target=_infinite_publish_job)
+        p.start()
+
+        yield p.is_alive()
+
+        # For cleanup, terminate the background process.
+        p.join(timeout=0)
+        p.terminate()
+
+    @staticmethod
+    def container_image(
+        image_path: str,
+        project: str = PROJECT,
+        tag: str = "latest",
+    ) -> str:
+        image_name = f"gcr.io/{project}/{image_path}-{UUID}:{tag}"
+        subprocess.run(["gcloud", "auth", "configure-docker"], check=True)
+        subprocess.run(
+            [
+                "gcloud",
+                "builds",
+                "submit",
+                f"--project={project}",
+                f"--tag={image_name}",
+                ".",
+            ],
+            check=True,
+        )
 
-    subscriber = pubsub.SubscriberClient()
-    subscription_path = subscriber.subscription_path(
-        project, f"{subscription_name}-{UUID}"
-    )
-    subscription = subscriber.create_subscription(subscription_path, topic_path)
+        yield image_name
+
+        subprocess.run(
+            [
+                "gcloud",
+                "container",
+                "images",
+                "delete",
+                image_name,
+                f"--project={project}",
+                "--quiet",
+            ],
+            check=True,
+        )
 
-    print(f"pubsub_subscription: {repr(subscription.name)}")
-    yield subscription.name
+    @staticmethod
+    def dataflow_job_id_from_job_name(
+        job_name: str,
+        project: str = PROJECT,
+    ) -> Optional[str]:
+        from googleapiclient.discovery import build
+
+        dataflow = build("dataflow", "v1b3")
+
+        # Only return the 50 most recent results - our job is likely to be in here.
+        # If the job is not found, first try increasing this number.
+        # For more info see:
+        #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
+        jobs_request = (
+            dataflow.projects()
+            .jobs()
+            .list(
+                projectId=project,
+                filter="ACTIVE",
+                pageSize=50,
+            )
+        )
+        response = jobs_request.execute()
+
+        # Search for the job in the list that has our name (names are unique)
+        for job in response["jobs"]:
+            if job["name"] == job_name:
+                return job["id"]
+        return None
+
+    @staticmethod
+    def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None:
+        import backoff
+        from googleapiclient.discovery import build
+        from googleapiclient.errors import HttpError
+
+        dataflow = build("dataflow", "v1b3")
+
+        @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
+        def cancel():
+            # To cancel a dataflow job, we need its ID, not its name
+            job_id = Utils.dataflow_job_id_from_job_name(project, job_name)
+
+            if job_id is not None:
+                # Cancel the Dataflow job if it exists.
+                # If it doesn't, job_id will be equal to None.
+                # For more info, see:
+                #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
+                request = (
+                    dataflow.projects()
+                    .jobs()
+                    .update(
+                        projectId=project,
+                        jobId=job_id,
+                        body={"requestedState": "JOB_STATE_CANCELLED"},
+                    )
+                )
+                request.execute()
+
+        cancel()
+
+    @staticmethod
+    def dataflow_flex_template_build(
+        bucket_name: str,
+        template_image: str,
+        metadata_file: str,
+        project: str = PROJECT,
+        template_file: str = "template.json",
+    ) -> str:
+        from google.cloud import storage
+
+        subprocess.call(
+            [
+                "gcloud",
+                "dataflow",
+                "flex-template",
+                "build",
+                f"gs://{bucket_name}/{template_file}",
+                f"--project={project}",
+                f"--image={template_image}",
+                "--sdk-language=PYTHON",
+                f"--metadata-file={metadata_file}",
+            ],
+            check=True,
+        )
 
-    # Due to the pinned library dependencies in apache-beam, client
-    # library throws an error upon deletion.
-    # We use gcloud for a workaround. See also:
-    # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-    subprocess.check_call(
-        [
-            "gcloud",
-            "pubsub",
-            "--project",
-            project,
-            "subscriptions",
-            "delete",
-            subscription_name,
-        ],
-        check=True,
-    )
+        yield f"gs://{bucket_name}/{template_file}"
+
+        storage_client = storage.Client()
+        storage_client.bucket(bucket_name).blob(template_file).delete()
+
+    def dataflow_flex_template_run(
+        job_name: str,
+        template_path: str,
+        bucket_name: str,
+        parameters: Dict[str, str] = {},
+        project: str = PROJECT,
+        region: str = REGION,
+    ) -> str:
+        unique_job_name = f"{job_name}-{UUID}"
+        subprocess.call(
+            [
+                "gcloud",
+                "dataflow",
+                "flex-template",
+                "run",
+                unique_job_name,
+                f"--template-file-gcs-location={template_path}",
+                f"--project={project}",
+                f"--region={region}",
+                f"--temp_location=gs://{bucket_name}/temp",
+            ]
+            + [f"--parameters={name}={value}" for name, value in parameters.items()],
+            check=True,
+        )
 
+        yield unique_job_name
 
-def pubsub_publisher(
-    topic_path: str,
-    new_msg: Callable[[int], str] = lambda i: json.dumps(
-        {"id": i, "content": f"message {i}"}
-    ),
-) -> bool:
-    from google.cloud import pubsub
 
-    def _infinite_publish_job() -> None:
-        publisher_client = pubsub.PublisherClient()
-        for i in itertools.count():
-            publisher_client.publish(topic_path, new_msg(i).encode("utf-8")).result()
-            time.sleep(1)
-
-    # Start a subprocess in the background to do the publishing.
-    p = mp.Process(target=_infinite_publish_job)
-    p.start()
-
-    yield p.is_alive()
-
-    # For cleanup, terminate the background process.
-    p.join(timeout=0)
-    p.terminate()
-
-
-def container_image(
-    image_path: str,
-    project: str = PROJECT,
-    tag: str = "latest",
-) -> str:
-    image_name = f"gcr.io/{project}/{image_path}-{UUID}:{tag}"
-    subprocess.run(["gcloud", "auth", "configure-docker"], check=True)
-    subprocess.run(
-        [
-            "gcloud",
-            "builds",
-            "submit",
-            f"--project={project}",
-            f"--tag={image_name}",
-            ".",
-        ],
-        check=True,
-    )
-
-    yield image_name
-
-    subprocess.run(
-        [
-            "gcloud",
-            "container",
-            "images",
-            "delete",
-            image_name,
-            f"--project={project}",
-            "--quiet",
-        ],
-        check=True,
-    )
-
-
-def dataflow_job_id_from_job_name(
-    job_name: str, project: str = PROJECT
-) -> Optional[str]:
-    from googleapiclient.discovery import build
-
-    dataflow = build("dataflow", "v1b3")
-
-    # Only return the 50 most recent results - our job is likely to be in here.
-    # If the job is not found, first try increasing this number.
-    # For more info see:
-    #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
-    jobs_request = (
-        dataflow.projects()
-        .jobs()
-        .list(
-            projectId=project,
-            filter="ACTIVE",
-            pageSize=50,
-        )
-    )
-    response = jobs_request.execute()
-
-    # Search for the job in the list that has our name (names are unique)
-    for job in response["jobs"]:
-        if job["name"] == job_name:
-            return job["id"]
-    return None
-
-
-def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None:
-    import backoff
-    from googleapiclient.discovery import build
-    from googleapiclient.errors import HttpError
-
-    dataflow = build("dataflow", "v1b3")
-
-    @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
-    def cancel():
-        # To cancel a dataflow job, we need its ID, not its name
-        job_id = dataflow_job_id_from_job_name(project, job_name)
-
-        if job_id is not None:
-            # Cancel the Dataflow job if it exists.
-            # If it doesn't, job_id will be equal to None.
-            # For more info, see:
-            #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
-            request = (
-                dataflow.projects()
-                .jobs()
-                .update(
-                    projectId=project,
-                    jobId=job_id,
-                    body={"requestedState": "JOB_STATE_CANCELLED"},
-                )
-            )
-            request.execute()
-
-    cancel()
-
-
-@staticmethod
-def dataflow_flex_template_build(
-    bucket_name: str,
-    template_image: str,
-    metadata_file: str,
-    project: str = PROJECT,
-    template_file: str = "template.json",
-) -> str:
-    from google.cloud import storage
-
-    subprocess.call(
-        [
-            "gcloud",
-            "dataflow",
-            "flex-template",
-            "build",
-            f"gs://{bucket_name}/{template_file}",
-            f"--project={project}",
-            f"--image={template_image}",
-            "--sdk-language=PYTHON",
-            f"--metadata-file={metadata_file}",
-        ],
-        check=True,
-    )
-
-    yield f"gs://{bucket_name}/{template_file}"
-
-    storage_client = storage.Client()
-    storage_client.bucket(bucket_name).blob(template_file).delete()
-
-
-def dataflow_flex_template_run(
-    job_name: str,
-    template_path: str,
-    bucket_name: str,
-    parameters: Dict[str, str] = {},
-    project: str = PROJECT,
-    region: str = REGION,
-) -> str:
-    unique_job_name = f"{job_name}-{UUID}"
-    subprocess.call(
-        [
-            "gcloud",
-            "dataflow",
-            "flex-template",
-            "run",
-            unique_job_name,
-            f"--template-file-gcs-location={template_path}",
-            f"--project={project}",
-            f"--region={region}",
-            f"--temp_location=gs://{bucket_name}/temp",
-        ]
-        + [f"--parameters={name}={value}" for name, value in parameters.items()],
-        check=True,
-    )
-
-    yield unique_job_name
+@pytest.fixture
+def utils() -> Utils:
+    return Utils()
diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index 60683a1bfcf..bbc1cfd10f3 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -13,34 +13,37 @@
 import json
 import time
 
-import conftest as utils
+try:
+    from conftest import Utils
+except ModuleNotFoundError:
+    pass
 import pytest
 
 NAME = "dataflow-flex-templates-streaming-beam"
 
 
 @pytest.fixture(scope="session")
-def bucket_name() -> str:
+def bucket_name(utils: Utils) -> str:
     return utils.storage_bucket(NAME)
 
 
 @pytest.fixture(scope="session")
-def pubsub_topic() -> str:
+def pubsub_topic(utils: Utils) -> str:
     return utils.pubsub_topic(NAME)
 
 
 @pytest.fixture(scope="session")
-def pubsub_subscription(pubsub_topic: str) -> str:
+def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str:
     return utils.pubsub_subscription(pubsub_topic, NAME)
 
 
 @pytest.fixture(scope="session")
-def bigquery_dataset() -> str:
+def bigquery_dataset(utils: Utils) -> str:
     return utils.bigquery_dataset(NAME.replace("-", "_"))
 
 
 @pytest.fixture(scope="session")
-def pubsub_publisher(pubsub_topic: str) -> bool:
+def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool:
     return utils.pubsub_publisher(
         pubsub_topic,
         new_msg=lambda i: json.dumps(
@@ -53,12 +56,12 @@ def pubsub_publisher(pubsub_topic: str) -> bool:
 
 
 @pytest.fixture(scope="session")
-def flex_template_image() -> str:
+def flex_template_image(utils: Utils) -> str:
     return utils.container_image(NAME)
 
 
 @pytest.fixture(scope="session")
-def flex_template_path(bucket_name: str, flex_template_image: str) -> str:
+def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str:
     return utils.dataflow_flex_template_build(
         bucket_name=bucket_name,
         template_image=flex_template_image,
@@ -67,6 +70,7 @@ def flex_template_path(bucket_name: str, flex_template_image: str) -> str:
 
 
 def test_run_template(
+    utils: Utils,
     bucket_name: str,
     pubsub_publisher: str,
     pubsub_subscription: str,

From d1faf4abbc4c223a47d840c3b5e2f2e8a6c53247 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Thu, 29 Apr 2021 10:36:00 -0700
Subject: [PATCH 14/57] add notes

---
 dataflow/flex-templates/streaming_beam/e2e_test.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index bbc1cfd10f3..f408bb27394 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -16,7 +16,11 @@
 try:
     from conftest import Utils
 except ModuleNotFoundError:
-    pass
+    from typing import Any
+
+    # `conftest` cannot be imported when running in `nox`, but we still
+    # try to import it for the autocomplete when writing the tests.
+    Utils = Any
 import pytest
 
 NAME = "dataflow-flex-templates-streaming-beam"

From c7ff583be5dae6d456e0a556583bb0cd256ddb45 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Thu, 29 Apr 2021 13:41:13 -0700
Subject: [PATCH 15/57] make session scoped

---
 dataflow/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index a7e6f713419..5a83d7aacab 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -307,6 +307,6 @@ def dataflow_flex_template_run(
         yield unique_job_name
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def utils() -> Utils:
     return Utils()

From aefd54ccbc652b5efd1baf9b3ced638a9c1032d2 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 30 Apr 2021 10:41:02 -0700
Subject: [PATCH 16/57] flex_template_run returns instead of yield

---
 dataflow/conftest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 5a83d7aacab..b3bc31e0776 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -279,6 +279,7 @@ def dataflow_flex_template_build(
         storage_client = storage.Client()
         storage_client.bucket(bucket_name).blob(template_file).delete()
 
+    @staticmethod
     def dataflow_flex_template_run(
         job_name: str,
         template_path: str,
@@ -304,7 +305,7 @@ def dataflow_flex_template_run(
             check=True,
         )
 
-        yield unique_job_name
+        return unique_job_name
 
 
 @pytest.fixture(scope="session")

From 1964725614632cfed631b4f8f63c4c1d6a73b916 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 30 Apr 2021 10:43:04 -0700
Subject: [PATCH 17/57] document try imports

---
 dataflow/flex-templates/streaming_beam/e2e_test.py | 4 ++--
 dataflow/run_template/main_test.py                 | 7 ++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index f408bb27394..afa2cb77ead 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -13,13 +13,13 @@
 import json
 import time
 
+# `conftest` cannot be imported when running in `nox`, but we still
+# try to import it for the autocomplete when writing the tests.
 try:
     from conftest import Utils
 except ModuleNotFoundError:
     from typing import Any
 
-    # `conftest` cannot be imported when running in `nox`, but we still
-    # try to import it for the autocomplete when writing the tests.
     Utils = Any
 import pytest
 
diff --git a/dataflow/run_template/main_test.py b/dataflow/run_template/main_test.py
index 1c20aeed1b4..d5d7eaf8195 100644
--- a/dataflow/run_template/main_test.py
+++ b/dataflow/run_template/main_test.py
@@ -31,7 +31,12 @@
 
 from werkzeug.urls import url_encode
 
-from . import main
+# Relative imports cannot be found when running in `nox`, but we still
+# try to import it for the autocomplete when writing the tests.
+try:
+    from . import main
+except ModuleNotFoundError:
+    import main
 
 
 RETRY_MAX_TIME = 5 * 60  # 5 minutes in seconds

From dd10c9aad5a1f6444cb20d7cd8560d1d99506c69 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 30 Apr 2021 12:29:43 -0700
Subject: [PATCH 18/57] make dataclass

---
 dataflow/conftest.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index b3bc31e0776..63fb7654170 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -31,6 +31,7 @@
 RETRY_MAX_TIME = 5 * 60  # 5 minutes in seconds
 
 
+@dataclass
 class Utils:
     uuid: str = UUID
     project: str = PROJECT

From 7b79e01630b9d5dba479caf777ec68d0e6d558ac Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 30 Apr 2021 12:30:00 -0700
Subject: [PATCH 19/57] fix exception type

---
 dataflow/run_template/main_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataflow/run_template/main_test.py b/dataflow/run_template/main_test.py
index d5d7eaf8195..0887f1b7817 100644
--- a/dataflow/run_template/main_test.py
+++ b/dataflow/run_template/main_test.py
@@ -35,7 +35,7 @@
 # try to import it for the autocomplete when writing the tests.
 try:
     from . import main
-except ModuleNotFoundError:
+except ImportError:
     import main
 
 
From 2f7ca670f59639f5fabe7853a1728a2237eb85dc Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 30 Apr 2021 12:32:15 -0700
Subject: [PATCH 20/57] make subprocess.run

---
 dataflow/conftest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 63fb7654170..6abb859f035 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -260,7 +260,7 @@ def dataflow_flex_template_build(
     ) -> str:
         from google.cloud import storage
 
-        subprocess.call(
+        subprocess.run(
             [
                 "gcloud",
                 "dataflow",
@@ -290,7 +290,7 @@ def dataflow_flex_template_run(
         region: str = REGION,
     ) -> str:
         unique_job_name = f"{job_name}-{UUID}"
-        subprocess.call(
+        subprocess.run(
             [
                 "gcloud",
                 "dataflow",

From aaa25ea7e99c961facdb6c357e8da4655308afc2 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 30 Apr 2021 15:04:46 -0700
Subject: [PATCH 21/57] use yield from

---
 dataflow/flex-templates/streaming_beam/e2e_test.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index afa2cb77ead..2ca3667c321 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -28,27 +28,27 @@
 
 @pytest.fixture(scope="session")
 def bucket_name(utils: Utils) -> str:
-    return utils.storage_bucket(NAME)
+    yield from utils.storage_bucket(NAME)
 
 
 @pytest.fixture(scope="session")
 def pubsub_topic(utils: Utils) -> str:
-    return utils.pubsub_topic(NAME)
+    yield from utils.pubsub_topic(NAME)
 
 
 @pytest.fixture(scope="session")
 def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str:
-    return utils.pubsub_subscription(pubsub_topic, NAME)
+    yield from utils.pubsub_subscription(pubsub_topic, NAME)
 
 
 @pytest.fixture(scope="session")
 def bigquery_dataset(utils: Utils) -> str:
-    return utils.bigquery_dataset(NAME.replace("-", "_"))
+    yield from utils.bigquery_dataset(NAME.replace("-", "_"))
 
 
 @pytest.fixture(scope="session")
 def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool:
-    return utils.pubsub_publisher(
+    yield from utils.pubsub_publisher(
         pubsub_topic,
         new_msg=lambda i: json.dumps(
             {
@@ -61,12 +61,12 @@ def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool:
 
 @pytest.fixture(scope="session")
 def flex_template_image(utils: Utils) -> str:
-    return utils.container_image(NAME)
+    yield from utils.container_image(NAME)
 
 
 @pytest.fixture(scope="session")
 def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str:
-    return utils.dataflow_flex_template_build(
+    yield from utils.dataflow_flex_template_build(
         bucket_name=bucket_name,
         template_image=flex_template_image,
         metadata_file="metadata.json",

From a33d9a339ad227d8202d67ca9c900c65ab69b06f Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 30 Apr 2021 15:52:48 -0700
Subject: [PATCH 22/57] use subprocess.run

---
 dataflow/conftest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 6abb859f035..8806e0f9db0 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -88,7 +88,7 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
         # library throws an error upon deletion.
         # We use gcloud for a workaround. See also:
         # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-        subprocess.check_call(
+        subprocess.run(
             ["gcloud", "pubsub", "--project", project, "topics", "delete", topic],
             check=True,
         )
@@ -114,7 +114,7 @@ def pubsub_subscription(
         # library throws an error upon deletion.
         # We use gcloud for a workaround. See also:
         # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-        subprocess.check_call(
+        subprocess.run(
             [
                 "gcloud",
                 "pubsub",

From 49fa5275be454ba3e2424fa17151a9328fcbbd9e Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Mon, 3 May 2021 16:22:28 -0700
Subject: [PATCH 23/57] add more logging and fix tests

---
 dataflow/conftest.py                          | 98 ++++++++++++-------
 .../flex-templates/streaming_beam/e2e_test.py |  4 +-
 2 files changed, 67 insertions(+), 35 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 8806e0f9db0..ae1aa4b0773 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -11,12 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
 from dataclasses import dataclass
-from datetime import time
 import itertools
 import json
 import multiprocessing as mp
 import os
 import subprocess
+import time
 from typing import Any, Callable, Dict, Iterable, Optional
 import uuid
 
@@ -28,6 +28,7 @@
 REGION = "us-west1"
 ZONE = "us-west1-b"
 
+CONSOLE_URL = "https://console.cloud.google.com"
 RETRY_MAX_TIME = 5 * 60  # 5 minutes in seconds
 
 
@@ -46,7 +47,8 @@ def storage_bucket(bucket_name: str) -> str:
         bucket_unique_name = f"{bucket_name}-{UUID}"
         bucket = storage_client.create_bucket(bucket_unique_name)
 
-        print(f"storage_bucket: {repr(bucket_unique_name)}")
+        print(f"storage_bucket: {bucket_unique_name}")
+        print(f"\t{CONSOLE_URL}/storage/browser/{bucket_unique_name}&project={PROJECT}")
         yield bucket_unique_name
 
         bucket.delete(force=True)
@@ -81,7 +83,10 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
         topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}")
         topic = publisher_client.create_topic(topic_path)
 
-        print(f"pubsub_topic: {repr(topic.name)}")
+        print(f"pubsub_topic: {topic.name}")
+        print(
+            f"\t{CONSOLE_URL}/cloudpubsub/topic/detail/{topic.name}&project={project}"
+        )
         yield topic.name
 
         # Due to the pinned library dependencies in apache-beam, client
@@ -107,7 +112,10 @@ def pubsub_subscription(
         )
         subscription = subscriber.create_subscription(subscription_path, topic_path)
 
-        print(f"pubsub_subscription: {repr(subscription.name)}")
+        print(f"pubsub_subscription: {subscription.name}")
+        print(
+            f"\t{CONSOLE_URL}/cloudpubsub/subscription/detail/{subscription.name}&project={project}"
+        )
         yield subscription.name
 
         # Due to the pinned library dependencies in apache-beam, client
@@ -122,7 +130,7 @@ def pubsub_subscription(
                 project,
                 "subscriptions",
                 "delete",
-                subscription_name,
+                subscription.name,
             ],
             check=True,
         )
@@ -139,18 +147,20 @@ def pubsub_publisher(
         def _infinite_publish_job() -> None:
             publisher_client = pubsub.PublisherClient()
             for i in itertools.count():
-                publisher_client.publish(
-                    topic_path, new_msg(i).encode("utf-8")
-                ).result()
+                msg = new_msg(i)
+                print(f">> publish[{i}]: {repr(msg)}")
+                publisher_client.publish(topic_path, msg.encode("utf-8")).result()
                 time.sleep(1)
 
         # Start a subprocess in the background to do the publishing.
+        print(f"Starting publisher on {topic_path}")
         p = mp.Process(target=_infinite_publish_job)
         p.start()
 
         yield p.is_alive()
 
         # For cleanup, terminate the background process.
+        print("Stopping publisher")
         p.join(timeout=0)
         p.terminate()
 
@@ -174,6 +184,10 @@ def container_image(
             check=True,
         )
 
+        print(f"container_image: {image_name}")
+        print(
+            f"\t{CONSOLE_URL}/gcr/images/{project}/GLOBAL/{image_path}?project={project}"
+        )
         yield image_name
 
         subprocess.run(
@@ -199,7 +213,7 @@ def dataflow_job_id_from_job_name(
         dataflow = build("dataflow", "v1b3")
 
         # Only return the 50 most recent results - our job is likely to be in here.
-        # If the job is not found, first try increasing this number.
+        # If the job is not found, first try increasing this number.[]''job_id
         # For more info see:
         #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
         jobs_request = (
@@ -220,7 +234,27 @@ def dataflow_job_id_from_job_name(
         return None
 
     @staticmethod
-    def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None:
+    def dataflow_jobs_cancel_by_job_id(job_id: str, project: str = PROJECT) -> None:
+        from googleapiclient.discovery import build
+
+        dataflow = build("dataflow", "v1b3")
+
+        # For more info, see:
+        #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
+        print(f"Canceling Dataflow job ID: {job_id}")
+        request = (
+            dataflow.projects()
+            .jobs()
+            .update(
+                projectId=project,
+                jobId=job_id,
+                body={"requestedState": "JOB_STATE_CANCELLED"},
+            )
+        )
+        request.execute()
+
+    @staticmethod
+    def dataflow_jobs_cancel_by_job_name(job_name: str, project: str = PROJECT) -> None:
         import backoff
         from googleapiclient.discovery import build
         from googleapiclient.errors import HttpError
@@ -229,24 +263,11 @@ def dataflow_jobs_cancel(job_name: str, project: str = PROJECT) -> None:
 
         @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
         def cancel():
-            # To cancel a dataflow job, we need its ID, not its name
+            # To cancel a dataflow job, we need its ID, not its name.
+            # If it doesn't, job_id will be equal to None.
             job_id = Utils.dataflow_job_id_from_job_name(project, job_name)
-
             if job_id is not None:
-                # Cancel the Dataflow job if it exists.
-                # If it doesn't, job_id will be equal to None.
-                # For more info, see:
-                #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
-                request = (
-                    dataflow.projects()
-                    .jobs()
-                    .update(
-                        projectId=project,
-                        jobId=job_id,
-                        body={"requestedState": "JOB_STATE_CANCELLED"},
-                    )
-                )
-                request.execute()
+                Utils.dataflow_jobs_cancel_by_job_id(job_id)
 
         cancel()
 
@@ -260,13 +281,14 @@ def dataflow_flex_template_build(
     ) -> str:
         from google.cloud import storage
 
-        subprocess.run(
+        template_gcs_path = f"gs://{bucket_name}/{template_file}"
+        p = subprocess.run(
             [
                 "gcloud",
                 "dataflow",
                 "flex-template",
                 "build",
-                f"gs://{bucket_name}/{template_file}",
+                template_gcs_path,
                 f"--project={project}",
                 f"--image={template_image}",
                 "--sdk-language=PYTHON",
@@ -275,7 +297,8 @@ def dataflow_flex_template_build(
             check=True,
         )
 
-        yield f"gs://{bucket_name}/{template_file}"
+        print(f"dataflow_flex_template_build: {template_gcs_path}")
+        yield template_gcs_path
 
         storage_client = storage.Client()
         storage_client.bucket(bucket_name).blob(template_file).delete()
@@ -290,7 +313,7 @@ def dataflow_flex_template_run(
         region: str = REGION,
     ) -> str:
         unique_job_name = f"{job_name}-{UUID}"
-        subprocess.run(
+        stdout = subprocess.run(
             [
                 "gcloud",
                 "dataflow",
@@ -304,9 +327,18 @@ def dataflow_flex_template_run(
             ]
             + [f"--parameters={name}={value}" for name, value in parameters.items()],
             check=True,
-        )
-
-        return unique_job_name
+            capture_output=True,
+        ).stdout.decode("utf-8")
+
+        print(f"Launched Dataflow template job: {unique_job_name}")
+        print(stdout)
+
+        try:
+            job_id = json.loads(stdout)["job_id"]
+            print(f"\t{CONSOLE_URL}/dataflow/jobs/{region}/{job_id}&project={project}")
+        except:
+            print(f"\t{CONSOLE_URL}/dataflow/jobs&project={project}")
+        return job_id
 
 
 @pytest.fixture(scope="session")
diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index 2ca3667c321..c94ac7b2741 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -83,7 +83,7 @@ def test_run_template(
 ) -> None:
 
     bigquery_table = "output_table"
-    job_name = utils.dataflow_flex_template_run(
+    job_id = utils.dataflow_flex_template_run(
         job_name=NAME,
         template_path=flex_template_path,
         bucket_name=bucket_name,
@@ -96,7 +96,7 @@ def test_run_template(
     # Since this is a streaming job, it will never finish running.
     # Wait for 10 minutes, and then cancel the job.
     time.sleep(10 * 60)
-    utils.dataflow_jobs_cancel(job_name)
+    utils.dataflow_jobs_cancel_by_job_id(job_id)
 
     # Check for output data in BigQuery.
     query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}"

From 63abad2ae3e366a19e315db73da701a4b0b5e61b Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Mon, 3 May 2021 17:05:31 -0700
Subject: [PATCH 24/57] more fixes

---
 dataflow/conftest.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index ae1aa4b0773..b156da27555 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -94,7 +94,7 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
         # We use gcloud for a workaround. See also:
         # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
         subprocess.run(
-            ["gcloud", "pubsub", "--project", project, "topics", "delete", topic],
+            ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name],
             check=True,
         )
 
@@ -141,6 +141,7 @@ def pubsub_publisher(
         new_msg: Callable[[int], str] = lambda i: json.dumps(
             {"id": i, "content": f"message {i}"}
         ),
+        sleep_sec: int = 1,
     ) -> bool:
         from google.cloud import pubsub
 
@@ -148,9 +149,8 @@ def _infinite_publish_job() -> None:
             publisher_client = pubsub.PublisherClient()
             for i in itertools.count():
                 msg = new_msg(i)
-                print(f">> publish[{i}]: {repr(msg)}")
                 publisher_client.publish(topic_path, msg.encode("utf-8")).result()
-                time.sleep(1)
+                time.sleep(sleep_sec)
 
         # Start a subprocess in the background to do the publishing.
         print(f"Starting publisher on {topic_path}")
@@ -279,10 +279,8 @@ def dataflow_flex_template_build(
         project: str = PROJECT,
         template_file: str = "template.json",
     ) -> str:
-        from google.cloud import storage
-
         template_gcs_path = f"gs://{bucket_name}/{template_file}"
-        p = subprocess.run(
+        subprocess.run(
             [
                 "gcloud",
                 "dataflow",
@@ -299,9 +297,7 @@ def dataflow_flex_template_build(
 
         print(f"dataflow_flex_template_build: {template_gcs_path}")
         yield template_gcs_path
-
-        storage_client = storage.Client()
-        storage_client.bucket(bucket_name).blob(template_file).delete()
+        # The template file gets deleted when we delete the bucket.
 
     @staticmethod
     def dataflow_flex_template_run(

From f0314e506716643279749ee20f27c5bb9500061d Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 4 May 2021 11:25:58 -0700
Subject: [PATCH 25/57] add more logging

---
 dataflow/conftest.py | 138 +++++++++++++++++++++----------------------
 1 file changed, 67 insertions(+), 71 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index b156da27555..b30af00ac57 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -93,10 +93,9 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
         # library throws an error upon deletion.
         # We use gcloud for a workaround. See also:
         # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-        subprocess.run(
-            ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name],
-            check=True,
-        )
+        cmd = ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name]
+        print(cmd)
+        subprocess.run(cmd, check=True)
 
     @staticmethod
     def pubsub_subscription(
@@ -122,18 +121,17 @@ def pubsub_subscription(
         # library throws an error upon deletion.
         # We use gcloud for a workaround. See also:
         # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
-        subprocess.run(
-            [
-                "gcloud",
-                "pubsub",
-                "--project",
-                project,
-                "subscriptions",
-                "delete",
-                subscription.name,
-            ],
-            check=True,
-        )
+        cmd = [
+            "gcloud",
+            "pubsub",
+            "--project",
+            project,
+            "subscriptions",
+            "delete",
+            subscription.name,
+        ]
+        print(cmd)
+        subprocess.run(cmd, check=True)
 
     @staticmethod
     def pubsub_publisher(
@@ -171,18 +169,19 @@ def container_image(
         tag: str = "latest",
     ) -> str:
         image_name = f"gcr.io/{project}/{image_path}-{UUID}:{tag}"
-        subprocess.run(["gcloud", "auth", "configure-docker"], check=True)
-        subprocess.run(
-            [
-                "gcloud",
-                "builds",
-                "submit",
-                f"--project={project}",
-                f"--tag={image_name}",
-                ".",
-            ],
-            check=True,
-        )
+        cmd = ["gcloud", "auth", "configure-docker"]
+        print(cmd)
+        subprocess.run(cmd, check=True)
+        cmd = [
+            "gcloud",
+            "builds",
+            "submit",
+            f"--project={project}",
+            f"--tag={image_name}",
+            ".",
+        ]
+        print(cmd)
+        subprocess.run(cmd, check=True)
 
         print(f"container_image: {image_name}")
         print(
@@ -190,18 +189,17 @@ def container_image(
         )
         yield image_name
 
-        subprocess.run(
-            [
-                "gcloud",
-                "container",
-                "images",
-                "delete",
-                image_name,
-                f"--project={project}",
-                "--quiet",
-            ],
-            check=True,
-        )
+        cmd = [
+            "gcloud",
+            "container",
+            "images",
+            "delete",
+            image_name,
+            f"--project={project}",
+            "--quiet",
+        ]
+        print(cmd)
+        subprocess.run(cmd, check=True)
 
     @staticmethod
     def dataflow_job_id_from_job_name(
@@ -280,20 +278,19 @@ def dataflow_flex_template_build(
         template_file: str = "template.json",
     ) -> str:
         template_gcs_path = f"gs://{bucket_name}/{template_file}"
-        subprocess.run(
-            [
-                "gcloud",
-                "dataflow",
-                "flex-template",
-                "build",
-                template_gcs_path,
-                f"--project={project}",
-                f"--image={template_image}",
-                "--sdk-language=PYTHON",
-                f"--metadata-file={metadata_file}",
-            ],
-            check=True,
-        )
+        cmd = [
+            "gcloud",
+            "dataflow",
+            "flex-template",
+            "build",
+            template_gcs_path,
+            f"--project={project}",
+            f"--image={template_image}",
+            "--sdk-language=PYTHON",
+            f"--metadata-file={metadata_file}",
+        ]
+        print(cmd)
+        subprocess.run(cmd, check=True)
 
         print(f"dataflow_flex_template_build: {template_gcs_path}")
         yield template_gcs_path
@@ -309,25 +306,24 @@ def dataflow_flex_template_run(
         region: str = REGION,
     ) -> str:
         unique_job_name = f"{job_name}-{UUID}"
-        stdout = subprocess.run(
-            [
-                "gcloud",
-                "dataflow",
-                "flex-template",
-                "run",
-                unique_job_name,
-                f"--template-file-gcs-location={template_path}",
-                f"--project={project}",
-                f"--region={region}",
-                f"--temp_location=gs://{bucket_name}/temp",
-            ]
-            + [f"--parameters={name}={value}" for name, value in parameters.items()],
-            check=True,
-            capture_output=True,
-        ).stdout.decode("utf-8")
+        cmd = [
+            "gcloud",
+            "dataflow",
+            "flex-template",
+            "run",
+            unique_job_name,
+            f"--template-file-gcs-location={template_path}",
+            f"--project={project}",
+            f"--region={region}",
+            f"--temp_location=gs://{bucket_name}/temp",
+        ] + [f"--parameters={name}={value}" for name, value in parameters.items()]
+        print(cmd)
+        stdout = subprocess.run(cmd, check=True, capture_output=True).stdout.decode(
+            "utf-8"
+        )
+        print(stdout)
 
         print(f"Launched Dataflow template job: {unique_job_name}")
-        print(stdout)
 
         try:
             job_id = json.loads(stdout)["job_id"]

From 56359c5f817c110d3f49ca0057d9b034d3157d42 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 4 May 2021 11:28:10 -0700
Subject: [PATCH 26/57] print gcloud version

---
 dataflow/conftest.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index b30af00ac57..e35df34b3ae 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -335,4 +335,5 @@ def dataflow_flex_template_run(
 
 @pytest.fixture(scope="session")
 def utils() -> Utils:
+    subprocess.run(["gcloud", "--version"])
     return Utils()

From 9eb72ab7f27c3bdc59c4954cc85b1ad8243e7515 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 4 May 2021 11:44:24 -0700
Subject: [PATCH 27/57] add gcloudignore

---
 dataflow/flex-templates/streaming_beam/.gcloudignore | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 dataflow/flex-templates/streaming_beam/.gcloudignore

diff --git a/dataflow/flex-templates/streaming_beam/.gcloudignore b/dataflow/flex-templates/streaming_beam/.gcloudignore
new file mode 100644
index 00000000000..e6babcd3e1a
--- /dev/null
+++ b/dataflow/flex-templates/streaming_beam/.gcloudignore
@@ -0,0 +1,3 @@
+*
+!*.py
+!requirements.txt
\ No newline at end of file

From 295167d87a416016885b8d9b31cc40e25a9ad87a Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 4 May 2021 11:51:05 -0700
Subject: [PATCH 28/57] update gcloud and optimize docker build

---
 dataflow/conftest.py                                 | 5 ++++-
 dataflow/flex-templates/streaming_beam/.dockerignore | 4 ++++
 dataflow/flex-templates/streaming_beam/.gcloudignore | 1 +
 dataflow/flex-templates/streaming_beam/e2e_test.py   | 2 +-
 4 files changed, 10 insertions(+), 2 deletions(-)
 create mode 100644 dataflow/flex-templates/streaming_beam/.dockerignore

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index e35df34b3ae..6cebfe01bf9 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -335,5 +335,8 @@ def dataflow_flex_template_run(
 
 @pytest.fixture(scope="session")
 def utils() -> Utils:
-    subprocess.run(["gcloud", "--version"])
+    # Some commands like `gcloud dataflow flex-template` are only available
+    # in the latest gcloud versions.
+    subprocess.run(["gcloud", "components", "update", "--quiet"], check=True)
+    subprocess.run(["gcloud", "version"])
     return Utils()
diff --git a/dataflow/flex-templates/streaming_beam/.dockerignore b/dataflow/flex-templates/streaming_beam/.dockerignore
new file mode 100644
index 00000000000..bd849df68f6
--- /dev/null
+++ b/dataflow/flex-templates/streaming_beam/.dockerignore
@@ -0,0 +1,4 @@
+# Ignore everything except for Python files and the requirements file.
+*
+!*.py
+!requirements.txt
\ No newline at end of file
diff --git a/dataflow/flex-templates/streaming_beam/.gcloudignore b/dataflow/flex-templates/streaming_beam/.gcloudignore
index e6babcd3e1a..bd849df68f6 100644
--- a/dataflow/flex-templates/streaming_beam/.gcloudignore
+++ b/dataflow/flex-templates/streaming_beam/.gcloudignore
@@ -1,3 +1,4 @@
+# Ignore everything except for Python files and the requirements file.
 *
 !*.py
 !requirements.txt
\ No newline at end of file
diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index c94ac7b2741..0fa6d853dfb 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -73,7 +73,7 @@ def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str)
     )
 
 
-def test_run_template(
+def test_flex_template_run(
     utils: Utils,
     bucket_name: str,
     pubsub_publisher: str,

From 89809fe061b1495822010b4a68997bcbed31d533 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 4 May 2021 11:59:09 -0700
Subject: [PATCH 29/57] print gcloud version

---
 dataflow/conftest.py | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 6cebfe01bf9..51fceb59ad5 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -28,7 +28,6 @@
 REGION = "us-west1"
 ZONE = "us-west1-b"
 
-CONSOLE_URL = "https://console.cloud.google.com"
 RETRY_MAX_TIME = 5 * 60  # 5 minutes in seconds
 
 
@@ -48,7 +47,6 @@ def storage_bucket(bucket_name: str) -> str:
         bucket = storage_client.create_bucket(bucket_unique_name)
 
         print(f"storage_bucket: {bucket_unique_name}")
-        print(f"\t{CONSOLE_URL}/storage/browser/{bucket_unique_name}&project={PROJECT}")
         yield bucket_unique_name
 
         bucket.delete(force=True)
@@ -84,9 +82,6 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str:
         topic = publisher_client.create_topic(topic_path)
 
         print(f"pubsub_topic: {topic.name}")
-        print(
-            f"\t{CONSOLE_URL}/cloudpubsub/topic/detail/{topic.name}&project={project}"
-        )
         yield topic.name
 
         # Due to the pinned library dependencies in apache-beam, client
@@ -112,9 +107,6 @@ def pubsub_subscription(
         subscription = subscriber.create_subscription(subscription_path, topic_path)
 
         print(f"pubsub_subscription: {subscription.name}")
-        print(
-            f"\t{CONSOLE_URL}/cloudpubsub/subscription/detail/{subscription.name}&project={project}"
-        )
         yield subscription.name
 
         # Due to the pinned library dependencies in apache-beam, client
@@ -184,9 +176,6 @@ def container_image(
         subprocess.run(cmd, check=True)
 
         print(f"container_image: {image_name}")
-        print(
-            f"\t{CONSOLE_URL}/gcr/images/{project}/GLOBAL/{image_path}?project={project}"
-        )
         yield image_name
 
         cmd = [
@@ -322,21 +311,14 @@ def dataflow_flex_template_run(
             "utf-8"
         )
         print(stdout)
-
-        print(f"Launched Dataflow template job: {unique_job_name}")
-
-        try:
-            job_id = json.loads(stdout)["job_id"]
-            print(f"\t{CONSOLE_URL}/dataflow/jobs/{region}/{job_id}&project={project}")
-        except:
-            print(f"\t{CONSOLE_URL}/dataflow/jobs&project={project}")
-        return job_id
+        print(f"Launched Dataflow Flex Template job: {unique_job_name}")
+        return json.loads(stdout)["job_id"]
 
 
 @pytest.fixture(scope="session")
 def utils() -> Utils:
     # Some commands like `gcloud dataflow flex-template` are only available
     # in the latest gcloud versions.
-    subprocess.run(["gcloud", "components", "update", "--quiet"], check=True)
+    # subprocess.run(["gcloud", "components", "update", "--quiet"], check=True)
     subprocess.run(["gcloud", "version"])
     return Utils()

From 7e5b53203b2cc7f2a0ddda3f7dacc66660637bc7 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Mon, 10 May 2021 09:19:14 -0700
Subject: [PATCH 30/57] remove outdated comments

---
 dataflow/conftest.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 51fceb59ad5..f06b168237c 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -317,8 +317,5 @@ def dataflow_flex_template_run(
 
 @pytest.fixture(scope="session")
 def utils() -> Utils:
-    # Some commands like `gcloud dataflow flex-template` are only available
-    # in the latest gcloud versions.
-    # subprocess.run(["gcloud", "components", "update", "--quiet"], check=True)
     subprocess.run(["gcloud", "version"])
     return Utils()

From e5d5641d8eaa62930782168521bb7bca0f103fc0 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Mon, 10 May 2021 09:22:13 -0700
Subject: [PATCH 31/57] updated test requirements

---
 dataflow/flex-templates/streaming_beam/requirements-test.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt
index 6943b5160fa..01511732bed 100644
--- a/dataflow/flex-templates/streaming_beam/requirements-test.txt
+++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt
@@ -1,4 +1,3 @@
-backoff==1.10.0
 google-api-python-client==2.1.0
 google-cloud-storage==1.38.0
 pytest-xdist==2.2.1

From 7ff556263a18eae92d5f49d5433d63a6a9244180 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Mon, 10 May 2021 09:39:29 -0700
Subject: [PATCH 32/57] include Dockerfile

---
 dataflow/flex-templates/streaming_beam/.gcloudignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dataflow/flex-templates/streaming_beam/.gcloudignore b/dataflow/flex-templates/streaming_beam/.gcloudignore
index bd849df68f6..594de3d29c8 100644
--- a/dataflow/flex-templates/streaming_beam/.gcloudignore
+++ b/dataflow/flex-templates/streaming_beam/.gcloudignore
@@ -1,4 +1,5 @@
 # Ignore everything except for Python files and the requirements file.
 *
+!Dockerfile
 !*.py
 !requirements.txt
\ No newline at end of file

From e5bdc4f92a754df4661e527a6db5697cfc91a3f9 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Mon, 10 May 2021 11:31:02 -0700
Subject: [PATCH 33/57] fix bigquery dataset names

---
 dataflow/conftest.py                               | 8 ++++++--
 dataflow/flex-templates/streaming_beam/e2e_test.py | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index f06b168237c..4fb5b3b04ae 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -57,13 +57,15 @@ def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str:
 
         bigquery_client = bigquery.Client()
         dataset = bigquery_client.create_dataset(
-            bigquery.Dataset(f"{project}.{dataset_name}_{UUID}")
+            bigquery.Dataset(f"{project}.{dataset_name.replace('-', '_')}_{UUID}")
         )
 
         print(f"bigquery_dataset: {dataset.full_dataset_id}")
         yield dataset.full_dataset_id
 
-        bigquery_client.delete_dataset(dataset.full_dataset_id, delete_contents=True)
+        bigquery_client.delete_dataset(
+            dataset.full_dataset_id.replace(":", "."), delete_contents=True
+        )
 
     @staticmethod
     def bigquery_query(query: str) -> Iterable[Dict[str, Any]]:
@@ -295,6 +297,7 @@ def dataflow_flex_template_run(
         region: str = REGION,
     ) -> str:
         unique_job_name = f"{job_name}-{UUID}"
+        print(f"dataflow_job_name: {unique_job_name}")
         cmd = [
             "gcloud",
             "dataflow",
@@ -317,5 +320,6 @@ def dataflow_flex_template_run(
 
 @pytest.fixture(scope="session")
 def utils() -> Utils:
+    print(f"Test unique identifier: {UUID}")
     subprocess.run(["gcloud", "version"])
     return Utils()
diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index 0fa6d853dfb..83d1bbc4e7e 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -43,7 +43,7 @@ def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str:
 
 @pytest.fixture(scope="session")
 def bigquery_dataset(utils: Utils) -> str:
-    yield from utils.bigquery_dataset(NAME.replace("-", "_"))
+    yield from utils.bigquery_dataset(NAME)
 
 
 @pytest.fixture(scope="session")

From e9da280a1e9d5af89c9d0b3fa55041460b93b2aa Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Mon, 10 May 2021 15:14:55 -0700
Subject: [PATCH 34/57] add debugging information on subprocess

---
 dataflow/conftest.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 4fb5b3b04ae..e155d976731 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -310,12 +310,24 @@ def dataflow_flex_template_run(
             f"--temp_location=gs://{bucket_name}/temp",
         ] + [f"--parameters={name}={value}" for name, value in parameters.items()]
         print(cmd)
-        stdout = subprocess.run(cmd, check=True, capture_output=True).stdout.decode(
-            "utf-8"
-        )
-        print(stdout)
-        print(f"Launched Dataflow Flex Template job: {unique_job_name}")
-        return json.loads(stdout)["job_id"]
+        try:
+            p = subprocess.run(cmd, check=True, capture_output=True)
+            stdout = p.stdout.decode("utf-8")
+            stderr = p.stdout.decode("utf-8")
+            print("--- stderr ---")
+            print(stderr.decode("utf-8"))
+            print("--- stdout ---")
+            print(stdout.decode("utf-8"))
+            print("--- end ---")
+            print(f"Launched Dataflow Flex Template job: {unique_job_name}")
+            return json.loads(stdout)["job_id"]
+        except subprocess.CalledProcessError as e:
+            print(e)
+            print("--- stderr ---")
+            print(e.stderr.decode("utf-8"))
+            print("--- stdout ---")
+            print(e.stdout.decode("utf-8"))
+            print("--- end ---")
 
 
 @pytest.fixture(scope="session")

From f2260a323e709fcf5f0a43b41ec2ef951fbd2f9c Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Mon, 10 May 2021 15:37:05 -0700
Subject: [PATCH 35/57] fix gcloud command

---
 dataflow/conftest.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index e155d976731..ce66f61e744 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -268,6 +268,7 @@ def dataflow_flex_template_build(
         project: str = PROJECT,
         template_file: str = "template.json",
     ) -> str:
+        # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/build
         template_gcs_path = f"gs://{bucket_name}/{template_file}"
         cmd = [
             "gcloud",
@@ -296,6 +297,7 @@ def dataflow_flex_template_run(
         project: str = PROJECT,
         region: str = REGION,
     ) -> str:
+        # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/run
         unique_job_name = f"{job_name}-{UUID}"
         print(f"dataflow_job_name: {unique_job_name}")
         cmd = [
@@ -307,8 +309,13 @@ def dataflow_flex_template_run(
             f"--template-file-gcs-location={template_path}",
             f"--project={project}",
             f"--region={region}",
-            f"--temp_location=gs://{bucket_name}/temp",
-        ] + [f"--parameters={name}={value}" for name, value in parameters.items()]
+        ] + [
+            f"--parameters={name}={value}"
+            for name, value in {
+                **parameters,
+                "temp_location": f"gs://{bucket_name}/temp",
+            }.items()
+        ]
         print(cmd)
         try:
             p = subprocess.run(cmd, check=True, capture_output=True)

From 0765c3ca1d9d269295ccefccc7fd29d55501ac87 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 11 May 2021 10:59:22 -0700
Subject: [PATCH 36/57] remove redundant decode

---
 dataflow/conftest.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index ce66f61e744..2c7295995cc 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -319,15 +319,13 @@ def dataflow_flex_template_run(
         print(cmd)
         try:
             p = subprocess.run(cmd, check=True, capture_output=True)
-            stdout = p.stdout.decode("utf-8")
-            stderr = p.stdout.decode("utf-8")
             print("--- stderr ---")
-            print(stderr.decode("utf-8"))
+            print(p.stderr.decode("utf-8"))
             print("--- stdout ---")
-            print(stdout.decode("utf-8"))
+            print(p.stdout.decode("utf-8"))
             print("--- end ---")
             print(f"Launched Dataflow Flex Template job: {unique_job_name}")
-            return json.loads(stdout)["job_id"]
+            return json.loads(p.stdout.decode("utf-8"))["job_id"]
         except subprocess.CalledProcessError as e:
             print(e)
             print("--- stderr ---")

From 9b16ba64799d95f5ff4b79f8ccab87c3b6a006cd Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 11 May 2021 11:21:58 -0700
Subject: [PATCH 37/57] fix getting flex template job id

---
 dataflow/conftest.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 2c7295995cc..789786864cc 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -21,6 +21,7 @@
 import uuid
 
 import pytest
+import yaml
 
 # Default options.
 UUID = uuid.uuid4().hex[0:6]
@@ -245,11 +246,8 @@ def dataflow_jobs_cancel_by_job_id(job_id: str, project: str = PROJECT) -> None:
     @staticmethod
     def dataflow_jobs_cancel_by_job_name(job_name: str, project: str = PROJECT) -> None:
         import backoff
-        from googleapiclient.discovery import build
         from googleapiclient.errors import HttpError
 
-        dataflow = build("dataflow", "v1b3")
-
         @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
         def cancel():
             # To cancel a dataflow job, we need its ID, not its name.
@@ -325,7 +323,7 @@ def dataflow_flex_template_run(
             print(p.stdout.decode("utf-8"))
             print("--- end ---")
             print(f"Launched Dataflow Flex Template job: {unique_job_name}")
-            return json.loads(p.stdout.decode("utf-8"))["job_id"]
+            return yaml.safe_load(p.stdout.decode("utf-8"))["job"]["id"]
         except subprocess.CalledProcessError as e:
             print(e)
             print("--- stderr ---")

From 4de0d99bb59a538fbe17f42e3892c9b1304ed19e Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 11 May 2021 11:37:16 -0700
Subject: [PATCH 38/57] add pyyaml test dependency

---
 dataflow/conftest.py                                         | 3 ++-
 dataflow/flex-templates/streaming_beam/requirements-test.txt | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 789786864cc..87d9643be38 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -21,7 +21,6 @@
 import uuid
 
 import pytest
-import yaml
 
 # Default options.
 UUID = uuid.uuid4().hex[0:6]
@@ -295,6 +294,8 @@ def dataflow_flex_template_run(
         project: str = PROJECT,
         region: str = REGION,
     ) -> str:
+        import yaml
+
         # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/run
         unique_job_name = f"{job_name}-{UUID}"
         print(f"dataflow_job_name: {unique_job_name}")
diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt
index 01511732bed..53cd9882b4e 100644
--- a/dataflow/flex-templates/streaming_beam/requirements-test.txt
+++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt
@@ -2,3 +2,4 @@ google-api-python-client==2.1.0
 google-cloud-storage==1.38.0
 pytest-xdist==2.2.1
 pytest==6.2.4
+pyyaml==5.4.1
\ No newline at end of file

From 0a1a4ccb04046be610805841773bb2315158c9a0 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 11 May 2021 11:50:33 -0700
Subject: [PATCH 39/57] use stdout/stderr instead of capture_output

---
 dataflow/conftest.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 87d9643be38..ea9e4d1ad4d 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -317,7 +317,12 @@ def dataflow_flex_template_run(
         ]
         print(cmd)
         try:
-            p = subprocess.run(cmd, check=True, capture_output=True)
+            # The `capture_output` option was added in Python 3.7, so we must
+            # pass the `stdout` and `stderr` options explicitly to support 3.6.
+            # https://docs.python.org/3/library/subprocess.html#subprocess.run
+            p = subprocess.run(
+                cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
             print("--- stderr ---")
             print(p.stderr.decode("utf-8"))
             print("--- stdout ---")

From b4392ee0724b4416a5152e1a996c76e9f262c833 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 11 May 2021 11:55:15 -0700
Subject: [PATCH 40/57] reorganized error handling

---
 dataflow/conftest.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index ea9e4d1ad4d..d2b3d4cb1b6 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -16,6 +16,7 @@
 import multiprocessing as mp
 import os
 import subprocess
+import sys
 import time
 from typing import Any, Callable, Dict, Iterable, Optional
 import uuid
@@ -323,20 +324,20 @@ def dataflow_flex_template_run(
             p = subprocess.run(
                 cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
             )
-            print("--- stderr ---")
-            print(p.stderr.decode("utf-8"))
-            print("--- stdout ---")
-            print(p.stdout.decode("utf-8"))
-            print("--- end ---")
+            stdout = p.stdout.decode("utf-8")
+            stderr = p.stderr.decode("utf-8")
             print(f"Launched Dataflow Flex Template job: {unique_job_name}")
-            return yaml.safe_load(p.stdout.decode("utf-8"))["job"]["id"]
         except subprocess.CalledProcessError as e:
-            print(e)
+            print(e, file=sys.stderr)
+            stdout = e.stdout.decode("utf-8")
+            stderr = e.stderr.decode("utf-8")
+        finally:
             print("--- stderr ---")
-            print(e.stderr.decode("utf-8"))
+            print(stderr)
             print("--- stdout ---")
-            print(e.stdout.decode("utf-8"))
+            print(stdout)
             print("--- end ---")
+        return yaml.safe_load(stdout)["job"]["id"]
 
 
 @pytest.fixture(scope="session")

From 669fbf200aa745895877f993e4604c66310ab98d Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 11 May 2021 15:00:52 -0700
Subject: [PATCH 41/57] retry cancel

---
 dataflow/conftest.py                          | 47 +++++++++----------
 .../streaming_beam/requirements-test.txt      |  1 +
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index d2b3d4cb1b6..5262837ac25 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -225,38 +225,37 @@ def dataflow_job_id_from_job_name(
 
     @staticmethod
     def dataflow_jobs_cancel_by_job_id(job_id: str, project: str = PROJECT) -> None:
+        import backoff
         from googleapiclient.discovery import build
+        from googleapiclient.errors import HttpError
 
         dataflow = build("dataflow", "v1b3")
 
-        # For more info, see:
-        #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
-        print(f"Canceling Dataflow job ID: {job_id}")
-        request = (
-            dataflow.projects()
-            .jobs()
-            .update(
-                projectId=project,
-                jobId=job_id,
-                body={"requestedState": "JOB_STATE_CANCELLED"},
+        @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
+        def cancel_job():
+            # For more info, see:
+            #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
+            print(f"Canceling Dataflow job ID: {job_id}")
+            request = (
+                dataflow.projects()
+                .jobs()
+                .update(
+                    projectId=project,
+                    jobId=job_id,
+                    body={"requestedState": "JOB_STATE_CANCELLED"},
+                )
             )
-        )
-        request.execute()
+            request.execute()
+
+        cancel_job()
 
     @staticmethod
     def dataflow_jobs_cancel_by_job_name(job_name: str, project: str = PROJECT) -> None:
-        import backoff
-        from googleapiclient.errors import HttpError
-
-        @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
-        def cancel():
-            # To cancel a dataflow job, we need its ID, not its name.
-            # If it doesn't, job_id will be equal to None.
-            job_id = Utils.dataflow_job_id_from_job_name(project, job_name)
-            if job_id is not None:
-                Utils.dataflow_jobs_cancel_by_job_id(job_id)
-
-        cancel()
+        # To cancel a dataflow job, we need its ID, not its name.
+        # If it doesn't, job_id will be equal to None.
+        job_id = Utils.dataflow_job_id_from_job_name(project, job_name)
+        if job_id is not None:
+            Utils.dataflow_jobs_cancel_by_job_id(job_id)
 
     @staticmethod
     def dataflow_flex_template_build(
diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt
index 53cd9882b4e..bb00b0295a1 100644
--- a/dataflow/flex-templates/streaming_beam/requirements-test.txt
+++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt
@@ -1,3 +1,4 @@
+backoff==1.10.0
 google-api-python-client==2.1.0
 google-cloud-storage==1.38.0
 pytest-xdist==2.2.1

From dab62185a933d51cc532105edd95ea4a5a5c0a16 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Wed, 12 May 2021 13:39:17 -0700
Subject: [PATCH 42/57] cancel dataflow job with region

---
 dataflow/conftest.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 5262837ac25..2802f449908 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -224,7 +224,9 @@ def dataflow_job_id_from_job_name(
         return None
 
     @staticmethod
-    def dataflow_jobs_cancel_by_job_id(job_id: str, project: str = PROJECT) -> None:
+    def dataflow_jobs_cancel_by_job_id(
+        job_id: str, project: str = PROJECT, region: str = REGION
+    ) -> None:
         import backoff
         from googleapiclient.discovery import build
         from googleapiclient.errors import HttpError
@@ -242,6 +244,7 @@ def cancel_job():
                 .update(
                     projectId=project,
                     jobId=job_id,
+                    location=region,
                     body={"requestedState": "JOB_STATE_CANCELLED"},
                 )
             )
@@ -250,12 +253,14 @@ def cancel_job():
         cancel_job()
 
     @staticmethod
-    def dataflow_jobs_cancel_by_job_name(job_name: str, project: str = PROJECT) -> None:
+    def dataflow_jobs_cancel_by_job_name(
+        job_name: str, project: str = PROJECT, region: str = REGION
+    ) -> None:
         # To cancel a dataflow job, we need its ID, not its name.
         # If it doesn't, job_id will be equal to None.
         job_id = Utils.dataflow_job_id_from_job_name(project, job_name)
         if job_id is not None:
-            Utils.dataflow_jobs_cancel_by_job_id(job_id)
+            Utils.dataflow_jobs_cancel_by_job_id(job_id, project, region)
 
     @staticmethod
     def dataflow_flex_template_build(

From c5929c70e3d2d94626f9f6456a9f0b80e55fc39e Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Wed, 12 May 2021 14:49:57 -0700
Subject: [PATCH 43/57] change cancel to gcloud

---
 dataflow/conftest.py | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 2802f449908..4d49016d9fa 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -238,17 +238,27 @@ def cancel_job():
             # For more info, see:
             #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
             print(f"Canceling Dataflow job ID: {job_id}")
-            request = (
-                dataflow.projects()
-                .jobs()
-                .update(
-                    projectId=project,
-                    jobId=job_id,
-                    location=region,
-                    body={"requestedState": "JOB_STATE_CANCELLED"},
-                )
-            )
-            request.execute()
+            # request = (
+            #     dataflow.projects()
+            #     .jobs()
+            #     .update(
+            #         projectId=project,
+            #         jobId=job_id,
+            #         location=region,
+            #         body={"requestedState": "JOB_STATE_CANCELLED"},
+            #     )
+            # )
+            # request.execute()
+            cmd = [
+                "gcloud",
+                f"--project={project}",
+                "dataflow",
+                "jobs",
+                "cancel",
+                job_id,
+                f"--region={region}",
+            ]
+            subprocess.run(cmd, check=True)
 
         cancel_job()
 

From ad1861cff348cd53074a9cca85be90aa3cccca9e Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Wed, 12 May 2021 15:20:30 -0700
Subject: [PATCH 44/57] simplify testing functions

---
 dataflow/conftest.py                          | 49 ++++++-------------
 .../streaming_beam/requirements-test.txt      |  2 -
 2 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 4d49016d9fa..ccdbc4ad7e5 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -227,40 +227,21 @@ def dataflow_job_id_from_job_name(
     def dataflow_jobs_cancel_by_job_id(
         job_id: str, project: str = PROJECT, region: str = REGION
     ) -> None:
-        import backoff
-        from googleapiclient.discovery import build
-        from googleapiclient.errors import HttpError
-
-        dataflow = build("dataflow", "v1b3")
-
-        @backoff.on_exception(backoff.expo, HttpError, max_time=RETRY_MAX_TIME)
-        def cancel_job():
-            # For more info, see:
-            #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
-            print(f"Canceling Dataflow job ID: {job_id}")
-            # request = (
-            #     dataflow.projects()
-            #     .jobs()
-            #     .update(
-            #         projectId=project,
-            #         jobId=job_id,
-            #         location=region,
-            #         body={"requestedState": "JOB_STATE_CANCELLED"},
-            #     )
-            # )
-            # request.execute()
-            cmd = [
-                "gcloud",
-                f"--project={project}",
-                "dataflow",
-                "jobs",
-                "cancel",
-                job_id,
-                f"--region={region}",
-            ]
-            subprocess.run(cmd, check=True)
-
-        cancel_job()
+        print(f"Canceling Dataflow job ID: {job_id}")
+        # We get an error using the googleapiclient.discovery APIs, probably
+        # due to incompatible dependencies with apache-beam.
+        # We use gcloud instead to cancel the job.
+        #   https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/cancel
+        cmd = [
+            "gcloud",
+            f"--project={project}",
+            "dataflow",
+            "jobs",
+            "cancel",
+            job_id,
+            f"--region={region}",
+        ]
+        subprocess.run(cmd, check=True)
 
     @staticmethod
     def dataflow_jobs_cancel_by_job_name(
diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt
index bb00b0295a1..3a98275ace3 100644
--- a/dataflow/flex-templates/streaming_beam/requirements-test.txt
+++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt
@@ -1,5 +1,3 @@
-backoff==1.10.0
-google-api-python-client==2.1.0
 google-cloud-storage==1.38.0
 pytest-xdist==2.2.1
 pytest==6.2.4

From 6a81184df743be00a18999a1a5946cfc44d32a7b Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 21 May 2021 11:16:02 -0700
Subject: [PATCH 45/57] Update dataflow/__init__.py

Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
---
 dataflow/__init__.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/dataflow/__init__.py b/dataflow/__init__.py
index ffc78f34e19..8b137891791 100644
--- a/dataflow/__init__.py
+++ b/dataflow/__init__.py
@@ -1,11 +1 @@
-# Copyright 2021 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+

From 212a5ca99822c4bf97ed00a84a4bbd3b72b3acb6 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 21 May 2021 11:16:10 -0700
Subject: [PATCH 46/57] Update
 dataflow/flex-templates/streaming_beam/__init__.py

Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
---
 dataflow/flex-templates/streaming_beam/__init__.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/dataflow/flex-templates/streaming_beam/__init__.py b/dataflow/flex-templates/streaming_beam/__init__.py
index ffc78f34e19..8b137891791 100644
--- a/dataflow/flex-templates/streaming_beam/__init__.py
+++ b/dataflow/flex-templates/streaming_beam/__init__.py
@@ -1,11 +1 @@
-# Copyright 2021 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+

From 9085b1d8c24c182dc0237b09443845904bb1abb9 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 21 May 2021 11:16:21 -0700
Subject: [PATCH 47/57] Update dataflow/flex-templates/__init__.py

Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
---
 dataflow/flex-templates/__init__.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/dataflow/flex-templates/__init__.py b/dataflow/flex-templates/__init__.py
index ffc78f34e19..8b137891791 100644
--- a/dataflow/flex-templates/__init__.py
+++ b/dataflow/flex-templates/__init__.py
@@ -1,11 +1 @@
-# Copyright 2021 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+

From 0fdc12714e8d5a2326f4746523ffda5e23f91a49 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 21 May 2021 11:17:38 -0700
Subject: [PATCH 48/57] Update
 dataflow/flex-templates/streaming_beam/noxfile_config.py

Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
---
 dataflow/flex-templates/streaming_beam/noxfile_config.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dataflow/flex-templates/streaming_beam/noxfile_config.py b/dataflow/flex-templates/streaming_beam/noxfile_config.py
index 79bccdd3e16..9d0a10cec91 100644
--- a/dataflow/flex-templates/streaming_beam/noxfile_config.py
+++ b/dataflow/flex-templates/streaming_beam/noxfile_config.py
@@ -32,6 +32,10 @@
     # to use your own Cloud project.
     "gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
     # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',
+    # If you need to use a specific version of pip,
+    # change pip_version_override to the string representation
+    # of the version number, for example, "20.2.4"
+    "pip_version_override": None,
     # A dictionary you want to inject into your test. Don't put any
     # secrets here. These values will override predefined values.
     "envs": {

From 29f7d628c2072972c5bc764afe43c1020d14d162 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 21 May 2021 11:25:11 -0700
Subject: [PATCH 49/57] Update __init__.py


From 20deb574faab28ab76294bc2d53fbc791011bf95 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 21 May 2021 11:54:39 -0700
Subject: [PATCH 50/57] Make __init__.py empty


From 74b8779e1c8607af10ac03fff7cedbf8309cb099 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Fri, 21 May 2021 11:57:23 -0700
Subject: [PATCH 51/57] make __init__ files actually empty

---
 dataflow/__init__.py                               | 1 -
 dataflow/flex-templates/streaming_beam/__init__.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/dataflow/__init__.py b/dataflow/__init__.py
index 8b137891791..e69de29bb2d 100644
--- a/dataflow/__init__.py
+++ b/dataflow/__init__.py
@@ -1 +0,0 @@
-
diff --git a/dataflow/flex-templates/streaming_beam/__init__.py b/dataflow/flex-templates/streaming_beam/__init__.py
index 8b137891791..e69de29bb2d 100644
--- a/dataflow/flex-templates/streaming_beam/__init__.py
+++ b/dataflow/flex-templates/streaming_beam/__init__.py
@@ -1 +0,0 @@
-

From e2cd129ce729f842a9e330afcd77fdd746e9d4be Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 1 Jun 2021 16:53:53 -0700
Subject: [PATCH 52/57] wait for job before cancel

---
 dataflow/conftest.py                          | 25 +++++++++++++++++++
 .../flex-templates/streaming_beam/e2e_test.py |  9 ++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index ccdbc4ad7e5..e402554023f 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -223,6 +223,31 @@ def dataflow_job_id_from_job_name(
                 return job["id"]
         return None
 
+    @staticmethod
+    def dataflow_jobs_wait(
+        job_id: str,
+        project: str = PROJECT,
+        status: str = "JOB_STATE_RUNNING",
+    ) -> bool:
+        from googleapiclient.discovery import build
+
+        dataflow = build("dataflow", "v1b3")
+
+        sleep_time_seconds = 30
+        max_sleep_time = 10 * 60
+        print(f"Waiting for Dataflow job ID: {job_id} (until status {status})")
+        for _ in range(0, max_sleep_time, sleep_time_seconds):
+            # For more info see:
+            #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get
+            jobs_request = (
+                dataflow.projects().jobs().get(projectId=project, jobId=job_id)
+            )
+            response = jobs_request.execute()
+            if response["currentState"] == status:
+                return True
+            time.sleep(30)
+        return False
+
     @staticmethod
     def dataflow_jobs_cancel_by_job_id(
         job_id: str, project: str = PROJECT, region: str = REGION
diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index 83d1bbc4e7e..e642306ed4b 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -94,11 +94,14 @@ def test_flex_template_run(
     )
 
     # Since this is a streaming job, it will never finish running.
-    # Wait for 10 minutes, and then cancel the job.
-    time.sleep(10 * 60)
+    # First, let's wait until the job is running.
+    utils.dataflow_jobs_wait(job_id)
+
+    # Then, wait a minute for data to arrive, get processed, and cancel it.
+    time.sleep(60)
     utils.dataflow_jobs_cancel_by_job_id(job_id)
 
-    # Check for output data in BigQuery.
+    # Check for the output data in BigQuery.
     query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}"
     rows = list(utils.bigquery_query(query))
     assert len(rows) > 0

From 0bd17076b0d9b487854ee768d289d26d5db6d162 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 1 Jun 2021 17:56:05 -0700
Subject: [PATCH 53/57] add api client libraries

---
 dataflow/flex-templates/streaming_beam/requirements-test.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dataflow/flex-templates/streaming_beam/requirements-test.txt b/dataflow/flex-templates/streaming_beam/requirements-test.txt
index 3a98275ace3..53cd9882b4e 100644
--- a/dataflow/flex-templates/streaming_beam/requirements-test.txt
+++ b/dataflow/flex-templates/streaming_beam/requirements-test.txt
@@ -1,3 +1,4 @@
+google-api-python-client==2.1.0
 google-cloud-storage==1.38.0
 pytest-xdist==2.2.1
 pytest==6.2.4

From 373ffb18717ac7074baf32d3eb7335b5ca230c73 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 1 Jun 2021 19:18:31 -0700
Subject: [PATCH 54/57] sleep before waiting for job

---
 dataflow/conftest.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index e402554023f..daad3531caf 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -235,6 +235,12 @@ def dataflow_jobs_wait(
 
         sleep_time_seconds = 30
         max_sleep_time = 10 * 60
+
+        # It takes a couple seconds for the job_id to be findable by the API client.
+        # Sleep for a small duration initially to wait until we can access the job
+        # from the client library.
+        time.sleep(sleep_time_seconds)
+
         print(f"Waiting for Dataflow job ID: {job_id} (until status {status})")
         for _ in range(0, max_sleep_time, sleep_time_seconds):
             # For more info see:
@@ -245,7 +251,7 @@ def dataflow_jobs_wait(
             response = jobs_request.execute()
             if response["currentState"] == status:
                 return True
-            time.sleep(30)
+            time.sleep(sleep_time_seconds)
         return False
 
     @staticmethod

From ca118e11d4dd0deec2dc53ea2781b815db4ef194 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Tue, 1 Jun 2021 19:38:25 -0700
Subject: [PATCH 55/57] add more logging

---
 dataflow/conftest.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index daad3531caf..6beebdeea42 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -246,9 +246,12 @@ def dataflow_jobs_wait(
             # For more info see:
             #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get
             jobs_request = (
-                dataflow.projects().jobs().get(projectId=project, jobId=job_id)
+                dataflow.projects()
+                .jobs()
+                .get(projectId=project, jobId=job_id, jobView="JOB_VIEW_SUMMARY")
             )
             response = jobs_request.execute()
+            print(response)
             if response["currentState"] == status:
                 return True
             time.sleep(sleep_time_seconds)

From a07b2f6f4849c54932b1a8db9a5e0d3b18c1291a Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Wed, 2 Jun 2021 10:08:26 -0700
Subject: [PATCH 56/57] fix parameter name

---
 dataflow/conftest.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 6beebdeea42..8473402c53d 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -248,7 +248,11 @@ def dataflow_jobs_wait(
             jobs_request = (
                 dataflow.projects()
                 .jobs()
-                .get(projectId=project, jobId=job_id, jobView="JOB_VIEW_SUMMARY")
+                .get(
+                    projectId=project,
+                    jobId=job_id,
+                    view="JOB_VIEW_SUMMARY",
+                )
             )
             response = jobs_request.execute()
             print(response)

From 6c9bcc329f113dcf0084cc5a4554b030f1afa06d Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Wed, 2 Jun 2021 11:23:48 -0700
Subject: [PATCH 57/57] surround wait in try-except

---
 dataflow/conftest.py | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 8473402c53d..13314bf86dd 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -236,28 +236,26 @@ def dataflow_jobs_wait(
         sleep_time_seconds = 30
         max_sleep_time = 10 * 60
 
-        # It takes a couple seconds for the job_id to be findable by the API client.
-        # Sleep for a small duration initially to wait until we can access the job
-        # from the client library.
-        time.sleep(sleep_time_seconds)
-
         print(f"Waiting for Dataflow job ID: {job_id} (until status {status})")
         for _ in range(0, max_sleep_time, sleep_time_seconds):
-            # For more info see:
-            #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get
-            jobs_request = (
-                dataflow.projects()
-                .jobs()
-                .get(
-                    projectId=project,
-                    jobId=job_id,
-                    view="JOB_VIEW_SUMMARY",
+            try:
+                # For more info see:
+                #   https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get
+                jobs_request = (
+                    dataflow.projects()
+                    .jobs()
+                    .get(
+                        projectId=project,
+                        jobId=job_id,
+                        view="JOB_VIEW_SUMMARY",
+                    )
                 )
-            )
-            response = jobs_request.execute()
-            print(response)
-            if response["currentState"] == status:
-                return True
+                response = jobs_request.execute()
+                print(response)
+                if response["currentState"] == status:
+                    return True
+            except:
+                pass
             time.sleep(sleep_time_seconds)
         return False