File size: 1,912 Bytes
f24563f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
#!/bin/bash
# setup_environment.sh - Script to set up the environment for training a powerful LLM on TPU v4-32
# This script will clone MaxText, install dependencies, and configure the environment
set -o errexit # Abort as soon as any command fails
echo "Setting up environment for LLM training on TPU v4-32..."
# Make sure the logs and checkpoints directories exist in the current directory
for workdir in logs checkpoints; do
  mkdir -p "$workdir"
done
# Obtain MaxText: refresh an existing checkout, or clone it on the first run.
# Both paths leave the script inside the maxtext directory.
maxtext_dir="maxtext"
maxtext_url="https://github.com/AI-Hypercomputer/maxtext.git"
if [ -d "$maxtext_dir" ]; then
  echo "MaxText repository already exists, updating..."
  cd "$maxtext_dir"
  git pull
else
  echo "Cloning MaxText repository..."
  git clone "$maxtext_url"
  cd "$maxtext_dir"
fi
# Install dependencies
echo "Installing dependencies..."
# Run the installer script found in the current directory (the MaxText checkout).
bash setup.sh
# Git hooks are only a development convenience, so a missing pre-commit binary
# (for example, installed to a directory that is not on PATH) should not abort
# the whole environment setup under `set -e`. Warn and carry on instead.
if command -v pre-commit > /dev/null 2>&1; then
  pre-commit install
else
  echo "Warning: pre-commit not found on PATH; skipping git hook installation" >&2
fi
# Set up environment variables
# NOTE: these exports reach only this script's process and its children (such
# as the python3 checks that follow); the calling shell is unchanged unless
# this file is sourced.
echo "Setting up environment variables..."
# Append the current directory (the MaxText checkout) to PYTHONPATH. The
# ${VAR:+...} form emits the ':' separator only when PYTHONPATH already has a
# value, so an unset or empty PYTHONPATH no longer yields a stray leading ':'.
# Quoting keeps paths containing spaces intact.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}"
# Restrict JAX to the TPU backend.
export JAX_PLATFORMS="tpu"
# Check TPU configuration
# A single interpreter run reports both values, so JAX is imported and the
# backend initialised only once. "TPU type" prints the device kind (the
# hardware description) rather than the platform name, which is just the
# backend identifier and says nothing about the TPU generation.
echo "Checking TPU configuration..."
python3 - <<'PYCODE'
import jax

first_device = jax.devices()[0]
print(f"TPU devices: {jax.device_count()}")
print(f"TPU type: {first_device.device_kind}")
PYCODE
# Make sure the GCS bucket for checkpoints and logs is reachable, creating it
# if listing it fails.
#   GCS_BUCKET  (required) bucket URL, e.g. gs://your-bucket-name
#   PROJECT_ID  (optional) GCP project to create the bucket in; when unset the
#               bucket is created in gsutil's configured default project
if [ -z "${GCS_BUCKET:-}" ]; then
  echo "Please set the GCS_BUCKET environment variable" >&2
  echo "Example: export GCS_BUCKET=gs://your-bucket-name" >&2
  exit 1
fi
echo "Checking GCS bucket access..."
if ! gsutil ls "$GCS_BUCKET" > /dev/null; then
  echo "Creating GCS bucket $GCS_BUCKET..."
  # Pass -p only when a project was supplied. With PROJECT_ID empty, an
  # unquoted "-p $PROJECT_ID $GCS_BUCKET" would make gsutil treat the bucket
  # URL as the project id.
  if [ -n "${PROJECT_ID:-}" ]; then
    gsutil mb -p "$PROJECT_ID" "$GCS_BUCKET"
  else
    gsutil mb "$GCS_BUCKET"
  fi
fi
# Set up Cloud Storage FUSE for efficient data access
echo "Setting up Cloud Storage FUSE..."
# Derive the bare bucket name from the gs:// URL with parameter expansion
# (no echo | sed subprocess), and drop a trailing '/' so that
# "gs://bucket/" and "gs://bucket" give the same name.
BUCKET_NAME="${GCS_BUCKET#gs://}"
BUCKET_NAME="${BUCKET_NAME%/}"
MOUNT_PATH="/tmp/gcsfuse"
# Arguments are quoted so each KEY=VALUE pair reaches the script as one word.
bash setup_gcsfuse.sh "DATASET_GCS_BUCKET=${BUCKET_NAME}" "MOUNT_PATH=${MOUNT_PATH}"
printf '%s\n' "Environment setup complete!"
# Step back up one level, out of the directory the script was working in
cd ..
printf '%s\n' "You can now run the training script with: bash tpu_train.sh"
|