diff --git a/examples/llm-foundry/aks/helm/llm-training/templates/pytorchjob.yaml b/examples/llm-foundry/aks/helm/llm-training/templates/pytorchjob.yaml
index 2517a71..ae518b4 100644
--- a/examples/llm-foundry/aks/helm/llm-training/templates/pytorchjob.yaml
+++ b/examples/llm-foundry/aks/helm/llm-training/templates/pytorchjob.yaml
@@ -43,6 +43,7 @@ spec:
               ulimit -s unlimited && \
               ulimit -l unlimited && \
               ulimit -a && \
+              LOG_FILE="/tmp/$(hostname).out" && \
               composer \
                 --world_size {{ mul .Values.training.nodes .Values.training.gpusPerNode }} \
                 --node_rank $RANK \
@@ -55,6 +56,14 @@ spec:
                 {{- range $key, $value := .Values.yamlUpdates }}
                 {{ $key }}={{ $value }} \
                 {{- end }}
+                2>&1 | tee "$LOG_FILE"
+              # "$?" here would be tee's status; PIPESTATUS[0] (bash-only) is the status of the command feeding tee.
+              EXIT_CODE=${PIPESTATUS[0]}
+              {{- if .Values.logs.copyLogsDir }}
+              mkdir -p "{{ .Values.logs.copyLogsDir }}"
+              cp "$LOG_FILE" "{{ .Values.logs.copyLogsDir }}/" || echo "Failed to copy logs"
+              {{- end }}
+              exit $EXIT_CODE
             env:
             {{- include "llm-training.nccl-env" . | nindent 12 }}
             {{- include "llm-training.sharp-env" . | nindent 12 }}
@@ -118,6 +127,7 @@ spec:
               ulimit -s unlimited && \
               ulimit -l unlimited && \
               ulimit -a && \
+              LOG_FILE="/tmp/$(hostname).out" && \
               composer \
                 --world_size {{ mul .Values.training.nodes .Values.training.gpusPerNode }} \
                 --node_rank $RANK \
@@ -130,6 +140,14 @@ spec:
                 {{- range $key, $value := .Values.yamlUpdates }}
                 {{ $key }}={{ $value }} \
                 {{- end }}
+                2>&1 | tee "$LOG_FILE"
+              # "$?" here would be tee's status; PIPESTATUS[0] (bash-only) is the status of the command feeding tee.
+              EXIT_CODE=${PIPESTATUS[0]}
+              {{- if .Values.logs.copyLogsDir }}
+              mkdir -p "{{ .Values.logs.copyLogsDir }}"
+              cp "$LOG_FILE" "{{ .Values.logs.copyLogsDir }}/" || echo "Failed to copy logs"
+              {{- end }}
+              exit $EXIT_CODE
             env:
             {{- include "llm-training.nccl-env" . | nindent 12 }}
             {{- include "llm-training.sharp-env" . | nindent 12 }}
diff --git a/examples/llm-foundry/aks/helm/llm-training/values.yaml b/examples/llm-foundry/aks/helm/llm-training/values.yaml
index 6fcfa3e..e777171 100644
--- a/examples/llm-foundry/aks/helm/llm-training/values.yaml
+++ b/examples/llm-foundry/aks/helm/llm-training/values.yaml
@@ -18,6 +18,11 @@ training:
 kueue:
   queueName: ""  # e.g., "gpu-local-queue" to use Kueue
 
+# Logs configuration (optional)
+# Set copyLogsDir to save stdout/stderr logs to a directory
+logs:
+  copyLogsDir: ""  # e.g., "/mnt/logs" - logs saved as $(hostname).out
+
 # Additional YAML updates for the training configuration
 yamlUpdates: {}
 # Example: