
Google Vertex AI

Vertex AI overview, AutoML, custom training, Model Garden, prediction endpoints, Feature Store, and pipelines

~50 min


Google Vertex AI is GCP's unified ML platform, combining Google's AutoML capabilities with custom training infrastructure. It represents Google's consolidation of previously separate ML services (AI Platform, AutoML Vision, etc.) into a single, cohesive platform.

Vertex AI vs SageMaker Philosophy

While SageMaker emphasizes flexibility (bring your own container/script), Vertex AI emphasizes simplicity and automation. Vertex AI's AutoML can train production-quality models with zero ML code, while its custom training supports any framework. This dual-track approach makes it approachable for beginners and powerful for experts.

Vertex AI Core Components

AutoML

Train high-quality models with no code and no ML expertise:
  • AutoML Tabular: Classification and regression on structured data
  • AutoML Image: Image classification and object detection
  • AutoML Text: Sentiment analysis, entity extraction, classification
  • AutoML Video: Action recognition, object tracking, classification
AutoML automatically handles:

  • Feature engineering and selection
  • Algorithm selection and architecture search
  • Hyperparameter tuning
  • Model evaluation and selection

Custom Training

For full control, Vertex AI supports custom training with any framework:
  • Pre-built containers: TensorFlow, PyTorch, XGBoost, scikit-learn
  • Custom containers: Any Docker image
  • Distributed training: Multi-GPU and multi-node
  • Hyperparameter tuning: Vizier (Google's black-box optimization service)
Model Garden

Vertex AI's model hub with 100+ foundation models:
  • Google models: Gemini, PaLM 2, Imagen, Codey
  • Open-source models: LLaMA, Mistral, Falcon, Stable Diffusion
  • Task-specific models: Pre-trained for common tasks
  • One-click deployment or fine-tuning
python
# Vertex AI AutoML — Train a tabular model
from google.cloud import aiplatform

aiplatform.init(project="my-project", location="us-central1")

# --- Step 1: Create a Dataset ---
dataset = aiplatform.TabularDataset.create(
    display_name="customer-churn-dataset",
    gcs_source="gs://my-bucket/data/churn.csv",
)

# --- Step 2: Train with AutoML ---
job = aiplatform.AutoMLTabularTrainingJob(
    display_name="churn-prediction",
    optimization_prediction_type="classification",
    optimization_objective="maximize-au-roc",
)

model = job.run(
    dataset=dataset,
    target_column="churned",
    training_fraction_split=0.8,
    validation_fraction_split=0.1,
    test_fraction_split=0.1,
    budget_milli_node_hours=1000,  # 1,000 milli node hours = 1 node hour
)

print(f"Model resource name: {model.resource_name}")
print(f"Model display name: {model.display_name}")

Custom Training with Vertex AI

python
# Custom training with a pre-built PyTorch container
from google.cloud import aiplatform

aiplatform.init(project="my-project", location="us-central1")

# --- Custom Training Job ---
job = aiplatform.CustomTrainingJob(
    display_name="fraud-detector-pytorch",
    script_path="train.py",            # Your training script
    container_uri=(
        "us-docker.pkg.dev/vertex-ai/"
        "training/pytorch-gpu.1-13:latest"
    ),
    requirements=["scikit-learn", "pandas"],
    model_serving_container_image_uri=(
        "us-docker.pkg.dev/vertex-ai/"
        "prediction/pytorch-gpu.1-13:latest"
    ),
)

# Run the training job
model = job.run(
    replica_count=1,
    machine_type="n1-standard-8",
    accelerator_type="NVIDIA_TESLA_T4",
    accelerator_count=1,
    args=["--epochs", "50", "--lr", "0.001"],
)

# --- Deploy to an endpoint ---
endpoint = model.deploy(
    deployed_model_display_name="fraud-detector-v1",
    machine_type="n1-standard-4",
    min_replica_count=1,
    max_replica_count=5,   # Auto-scales up to 5 replicas
)

# Make predictions
instances = [
    {"amount": 150.0, "merchant_category": "retail", "hour": 14},
    {"amount": 5000.0, "merchant_category": "wire_transfer", "hour": 3},
]
predictions = endpoint.predict(instances=instances)
print(predictions)

Vertex AI Feature Store

A centralized, managed repository for ML features:

  • Avoids training-serving skew: Same feature computation logic for training and inference
  • Feature sharing: Teams can discover and reuse features across projects
  • Point-in-time lookups: Get feature values as they were at a specific timestamp
  • Online serving: Low-latency feature retrieval for real-time inference
  • Offline serving: Batch export for training dataset creation
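Point-in-time lookups are what prevent label leakage: each training row may only see feature values that existed at its own timestamp. The semantics can be sketched locally with pandas (a conceptual illustration of the guarantee, not the Feature Store API; all names are invented):

```python
# Point-in-time join: for each label timestamp, pick the latest feature
# value written at or before that moment — never a future value.
import pandas as pd

# Feature values as they were written over time for one entity
features = pd.DataFrame({
    "timestamp": pd.to_datetime(["2024-01-01", "2024-01-10", "2024-01-20"]),
    "entity_id": ["user_1"] * 3,
    "avg_spend_30d": [100.0, 150.0, 90.0],
})

# Training labels collected at various times
labels = pd.DataFrame({
    "timestamp": pd.to_datetime(["2024-01-05", "2024-01-15", "2024-01-25"]),
    "entity_id": ["user_1"] * 3,
    "churned": [0, 0, 1],
})

# merge_asof matches each label with the most recent earlier feature value
training_set = pd.merge_asof(
    labels.sort_values("timestamp"),
    features.sort_values("timestamp"),
    on="timestamp",
    by="entity_id",
)
print(training_set[["timestamp", "avg_spend_30d", "churned"]])
```

The label at 2024-01-15 gets the feature value written on 2024-01-10 (150.0), not the later 90.0 — exactly the as-of behavior a feature store's point-in-time lookup provides at scale.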
Vertex AI Pipelines

Built on Kubeflow Pipelines (open source), with managed infrastructure:

  • Define pipelines as Python functions using the KFP SDK
  • Each step runs in its own container
  • Automatic metadata tracking and lineage
  • Integrates with Vertex AI services (training, prediction, feature store)
python
# Vertex AI Pipeline example using the KFP SDK
from kfp.v2 import dsl, compiler
from google.cloud import aiplatform

@dsl.component(base_image="python:3.10",
               packages_to_install=["pandas"])
def preprocess_data(
    input_path: str,
    output_path: dsl.OutputPath("Dataset"),
):
    """Preprocess raw data and save cleaned dataset."""
    import pandas as pd

    df = pd.read_csv(input_path)
    # Clean and transform
    df = df.dropna()
    df = df[df["amount"] > 0]
    df.to_csv(output_path, index=False)

@dsl.component(base_image="python:3.10",
               packages_to_install=["scikit-learn", "pandas"])
def train_model(
    dataset_path: dsl.InputPath("Dataset"),
    model_path: dsl.OutputPath("Model"),
    accuracy: dsl.Output[dsl.Metrics],
):
    """Train a model and report metrics."""
    import pickle

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(dataset_path)
    X = df.drop("target", axis=1)
    y = df["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)

    accuracy.log_metric("accuracy", acc)
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

@dsl.pipeline(name="ml-training-pipeline")
def ml_pipeline(input_data: str = "gs://bucket/data.csv"):
    preprocess_task = preprocess_data(input_path=input_data)
    train_task = train_model(
        dataset_path=preprocess_task.outputs["output_path"]
    )

# Compile and submit
compiler.Compiler().compile(
    pipeline_func=ml_pipeline,
    package_path="pipeline.json",
)

aiplatform.init(project="my-project", location="us-central1")
aiplatform.PipelineJob(
    display_name="training-pipeline-run",
    template_path="pipeline.json",
    pipeline_root="gs://my-bucket/pipeline-root",
).submit()

When to Use AutoML vs Custom Training

Use AutoML when:
  • You have structured tabular data
  • You want a quick baseline model
  • Your team lacks deep ML expertise
  • You need to prototype fast

Use custom training when:
  • You need specific architectures (transformers, GNNs)
  • You require distributed training
  • You need custom loss functions
  • You need fine-grained control over the training process