Lab: test suite

Write a complete test suite for a pipeline script — covering config loading, data transformation, mocked HTTP, file output, and error paths.

A pipeline without tests is a time bomb. This lab provides a small but realistic pipeline module and guides you through writing six tests that cover each layer that matters. By the end, every function in the module except the run orchestrator has at least one test (Checkpoint 2 adds that one), and the main error paths — a missing config key and an HTTP failure — are covered.

The pipeline module under test

Read through the module carefully before writing tests. Note:

load_config reads from a dict (simulating os.environ) and raises on missing keys.
transform_records is pure — it only computes.
fetch_records calls an external URL.
write_report writes JSON to disk.
run is the orchestrator — it calls all four in sequence.

Python — editable, runs in your browser

import json, os, tempfile, urllib.request, urllib.error
from pathlib import Path
from unittest.mock import patch, MagicMock

# ════════════════════════════════════════════════════════════════════════════
# pipeline.py  (the module under test — defined inline for the demo)
# ════════════════════════════════════════════════════════════════════════════

def load_config(env: dict) -> dict:
    """Build the pipeline configuration from an environment-style mapping.

    ``API_URL`` and ``OUTPUT_DIR`` are mandatory; ``RETRIES`` is optional and
    falls back to ``"3"`` when absent.

    Returns:
        A dict with ``api_url`` (taken as-is), ``output_dir`` (wrapped in a
        ``Path``) and ``retries`` (converted with ``int``).

    Raises:
        KeyError: if any mandatory key is absent; the message lists them.
        ValueError: if ``RETRIES`` is present but ``int()`` rejects it.
    """
    absent = [name for name in ("API_URL", "OUTPUT_DIR") if name not in env]
    if absent:
        raise KeyError(f"Missing required config keys: {absent}")

    api_url = env["API_URL"]
    output_dir = Path(env["OUTPUT_DIR"])
    retries = int(env.get("RETRIES", "3"))
    return {"api_url": api_url, "output_dir": output_dir, "retries": retries}

def transform_records(records: list) -> list:
    """Return cleaned copies of *records*, keeping only non-negative values.

    Each record's ``value`` is converted with ``int``; records whose converted
    value is below zero are left out. Surviving records are rebuilt with just
    the ``id`` and ``value`` keys, in their original order.
    """
    # Lazy pairing keeps the per-record order "convert, test, then read id",
    # so a dropped record never has its "id" looked up.
    converted = ((rec, int(rec["value"])) for rec in records)
    return [{"id": rec["id"], "value": num} for rec, num in converted if num >= 0]

def fetch_records(url: str) -> list:
    """Download *url* and return its body parsed as JSON.

    The request uses a 10-second timeout, and the body is decoded with the
    ``bytes.decode()`` default (UTF-8) before parsing.

    Raises:
        RuntimeError: if urllib raises ``HTTPError``; the message carries the
            status code and the URL, and the original error is chained as the
            cause. Any other exception propagates unchanged.
    """
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            body = response.read()
            return json.loads(body.decode())
    except urllib.error.HTTPError as http_err:
        message = f"HTTP {http_err.code}: {url}"
        raise RuntimeError(message) from http_err
def write_report(records: list, output_dir: Path) -> Path:
    """Serialise *records* to ``report.json`` inside *output_dir*.

    The directory (with any missing parents) is created first; the JSON is
    pretty-printed with a two-space indent.

    Returns:
        The path of the file that was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    serialised = json.dumps(records, indent=2)
    report_path = output_dir.joinpath("report.json")
    report_path.write_text(serialised)
    return report_path

def run(env: dict) -> Path:
    """Run the whole pipeline: load config, fetch, transform, write the report.

    Returns:
        The path returned by ``write_report``.
    """
    settings = load_config(env)
    cleaned = transform_records(fetch_records(settings["api_url"]))
    return write_report(cleaned, settings["output_dir"])

# ════════════════════════════════════════════════════════════════════════════
# tests  (normally in test_pipeline.py)
# ════════════════════════════════════════════════════════════════════════════

def assert_equal(actual, expected, label):
    """Print a PASS or FAIL line for one comparison; never raises on mismatch.

    *label* identifies the check in the printed line. On a mismatch, the
    ``repr`` of both the expected and the actual value is shown.
    """
    mismatch = actual != expected
    if mismatch:
        print(f"FAIL [{label}]: expected {expected!r}, got {actual!r}")
        return
    print(f"PASS [{label}]")

# ── Test 1: config loading happy path ────────────────────────────────────────
def test_load_config_happy():
    """A fully populated env yields the expected api_url and integer retries."""
    env = {
        "API_URL": "https://api.example.com/data",
        "OUTPUT_DIR": "/tmp/out",
        "RETRIES": "5",
    }
    loaded = load_config(env)
    assert_equal(loaded["api_url"], "https://api.example.com/data", "api_url")
    assert_equal(loaded["retries"], 5, "retries parsed as int")

# ── Test 2: config loading raises on missing key ────────────────────────────
def test_load_config_missing_key():
    """load_config raises a KeyError that names the absent OUTPUT_DIR key."""
    incomplete_env = {"API_URL": "https://api.example.com/data"}  # no OUTPUT_DIR
    try:
        load_config(incomplete_env)
    except KeyError as exc:
        # The error must say which key is missing, not merely that one is.
        assert "OUTPUT_DIR" in str(exc)
        print(f"PASS [missing key]: KeyError raised with {exc}")
    else:
        print("FAIL [missing key]: expected KeyError")

# ── Test 3: transform_records filters and coerces ────────────────────────────
def test_transform_records():
    """transform_records drops the negative record and converts values to int."""
    samples = [
        {"id": "a", "value": "10"},
        {"id": "b", "value": "-5"},  # negative: expected to be filtered out
        {"id": "c", "value": "0"},
        {"id": "d", "value": "  7"},  # int() tolerates the leading whitespace
    ]
    cleaned = transform_records(samples)
    assert_equal(len(cleaned), 3, "negative record dropped")
    first, last = cleaned[0], cleaned[2]
    assert_equal(first["value"], 10, "value coerced to int")
    assert_equal(last["value"], 7, "whitespace-padded value coerced")

# ── Test 4: fetch_records with mocked HTTP ───────────────────────────────────
def test_fetch_records_mocked():
    """fetch_records parses the mocked response body and requests the given URL.

    ``urllib.request.urlopen`` is patched so no network call is made; the
    stand-in response supplies only what fetch_records uses: ``read()`` and
    the context-manager protocol.
    """
    fake_data = [{"id": "x", "value": "1"}]

    class FakeResp:
        """Minimal stand-in for the object returned by urlopen."""

        def read(self):
            return json.dumps(fake_data).encode()

        def __enter__(self):
            return self

        def __exit__(self, *_):
            pass

    with patch("urllib.request.urlopen", return_value=FakeResp()) as mock_call:
        result = fetch_records("https://api.example.com/data")

    assert_equal(result, fake_data, "returns parsed JSON")
    # Raises AssertionError if urlopen was called with a different URL or timeout.
    mock_call.assert_called_once_with("https://api.example.com/data", timeout=10)
    print("PASS [fetch URL asserted]")

# ── Test 5: write_report creates file with correct content ───────────────────
def test_write_report_file_contents():
    """write_report creates report.json and its content round-trips as JSON."""
    expected = [{"id": "a", "value": 42}]
    with tempfile.TemporaryDirectory() as scratch:
        report = write_report(expected, Path(scratch))
        assert report.exists(), "report.json not created"
        on_disk = json.loads(report.read_text())
        assert_equal(on_disk, expected, "file contents match records")

# ── Test 6: error path — fetch_records raises RuntimeError on 500 ────────────
def test_fetch_records_500():
    """fetch_records turns an HTTP 500 from urlopen into a RuntimeError."""
    target = "https://api.example.com/data"
    server_failure = urllib.error.HTTPError(
        url="https://api.example.com/data",
        code=500,
        msg="Server Error",
        hdrs={},
        fp=None,
    )
    with patch("urllib.request.urlopen", side_effect=server_failure):
        try:
            fetch_records(target)
        except RuntimeError as exc:
            # The status code must survive into the translated error message.
            assert "500" in str(exc)
            print(f"PASS [500 path]: RuntimeError raised: {exc}")
        else:
            print("FAIL [500 path]: expected RuntimeError")

# ── Run all tests ────────────────────────────────────────────────────────────
# Run every test in definition order, then print a completion banner.
print("=== Running test suite ===")
for case_fn in (
    test_load_config_happy,
    test_load_config_missing_key,
    test_transform_records,
    test_fetch_records_mocked,
    test_write_report_file_contents,
    test_fetch_records_500,
):
    case_fn()
print("\nAll tests complete.")

Checkpoint 1 — break a test intentionally

Change the transform_records function so it does not drop negative values. Run the tests. test_transform_records should fail with a clear message. Restore the filter and confirm all tests pass again.

This is the most important habit: verify that tests actually catch the bug they claim to catch, not just that they pass when the code is correct.

Checkpoint 2 — test the orchestrator

run calls four functions in sequence. Add a test that:

Mocks fetch_records to return a list of two records.
Uses a tmp_path-style temporary directory for OUTPUT_DIR.
Calls run(env) with appropriate env values.
Asserts the output file exists and contains the transformed data.

This is an integration test within the module — it exercises all four functions together without hitting the real network.

An integration test that mocks only the network boundary (not the filesystem) is often the most valuable test you can write. It proves that the pieces fit together, not just that each piece works in isolation.

Checkpoint 3 — add a permission-error path

Modify write_report to raise PermissionError if output_dir is /root (unwritable for a non-root user on a standard Linux system). Write a test that:

Calls write_report(records, Path("/root")).
Asserts the PermissionError is raised.

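Then revert the change. The point is to practise writing a test before writing the code — red first, then green — so try the exercise in that order too: write the test first and watch it fail, then make the change to write_report and watch it pass.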

Converting to real pytest

When you run these as a proper pytest suite, replace the manual if/else checks with assert statements and let pytest handle the output formatting:

def test_transform_records():
    """The negative record is dropped and the survivor's value becomes an int."""
    cleaned = transform_records([
        {"id": "a", "value": "10"},
        {"id": "b", "value": "-5"},
    ])
    assert len(cleaned) == 1
    assert cleaned[0]["value"] == 10

def test_write_report_file_contents(tmp_path):
    """write_report creates report.json under tmp_path with the given records."""
    expected = [{"id": "a", "value": 42}]
    report = write_report(expected, tmp_path)
    assert report.exists()
    assert json.loads(report.read_text()) == expected

Note how tmp_path comes in as a pytest fixture — no tempfile.TemporaryDirectory context manager needed.

Where to go next

Module complete. Next up: Workflow Orchestration — once your pipelines are tested and hardened, the next step is expressing their dependencies explicitly as DAGs and running them with a proper orchestration tool.

Finished reading? Mark it complete to track your progress.