diff --git a/dev-tools/mcp-mock-server/server.py b/dev-tools/mcp-mock-server/server.py index b7e17fffb..a6f09dcbb 100644 --- a/dev-tools/mcp-mock-server/server.py +++ b/dev-tools/mcp-mock-server/server.py @@ -6,16 +6,19 @@ useful for validating that Lightspeed Core Stack correctly sends auth headers to MCP servers. -The server runs both HTTP and HTTPS simultaneously on consecutive ports. +The server runs HTTP and optionally HTTPS on consecutive ports. +Set MCP_HTTP_ONLY=true to disable HTTPS (useful when openssl is unavailable). Usage: python server.py [http_port] Example: python server.py 3000 # HTTP on 3000, HTTPS on 3001 + MCP_HTTP_ONLY=true python server.py 3000 # HTTP only on 3000 """ import json +import os import ssl import subprocess import sys @@ -268,31 +271,43 @@ def run_https_server(port: int, httpd: HTTPServer) -> None: def main() -> None: - """Start the mock MCP server with both HTTP and HTTPS.""" + """Start the mock MCP server with HTTP and optionally HTTPS.""" http_port = int(sys.argv[1]) if len(sys.argv) > 1 else 3000 - https_port = http_port + 1 + http_only = os.environ.get("MCP_HTTP_ONLY", "").lower() in ("true", "1", "yes") # Create HTTP server http_server = HTTPServer(("", http_port), MCPMockHandler) - # Create HTTPS server with self-signed certificate - https_server = HTTPServer(("", https_port), MCPMockHandler) - - # Generate or load self-signed certificate - script_dir = Path(__file__).parent - cert_dir = script_dir / ".certs" - cert_file, key_file = generate_self_signed_cert(cert_dir) - - # Wrap socket with SSL - context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) - context.load_cert_chain(cert_file, key_file) - https_server.socket = context.wrap_socket(https_server.socket, server_side=True) + https_server = None + if not http_only: + try: + https_port = http_port + 1 + https_server = HTTPServer(("", https_port), MCPMockHandler) + + # Generate or load self-signed certificate + script_dir = Path(__file__).parent + cert_dir = script_dir / 
".certs" + cert_file, key_file = generate_self_signed_cert(cert_dir) + + # Wrap socket with SSL + context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + context.load_cert_chain(cert_file, key_file) + https_server.socket = context.wrap_socket( + https_server.socket, server_side=True + ) + except (subprocess.CalledProcessError, FileNotFoundError, OSError) as e: + print(f"HTTPS setup failed ({e}), running HTTP only") + https_server = None print("=" * 70) - print("MCP Mock Server starting with HTTP and HTTPS") + if https_server: + print("MCP Mock Server starting with HTTP and HTTPS") + else: + print("MCP Mock Server starting (HTTP only)") print("=" * 70) print(f"HTTP: http://localhost:{http_port}") - print(f"HTTPS: https://localhost:{https_port}") + if https_server: + print(f"HTTPS: https://localhost:{https_port}") print("=" * 70) print("Debug endpoints:") print(" • /debug/headers - View captured headers") @@ -300,29 +315,35 @@ def main() -> None: print("MCP endpoint:") print(" • POST to any path (e.g., / or /mcp/v1/list_tools)") print("=" * 70) - print("Note: HTTPS uses a self-signed certificate (for testing only)") + if https_server: + print("Note: HTTPS uses a self-signed certificate (for testing only)") print("Press Ctrl+C to stop") print() - # Start servers in separate threads + # Start HTTP server in a thread http_thread = threading.Thread( target=run_http_server, args=(http_port, http_server), daemon=True ) - https_thread = threading.Thread( - target=run_https_server, args=(https_port, https_server), daemon=True - ) - http_thread.start() - https_thread.start() + + # Start HTTPS server if available + https_thread = None + if https_server: + https_thread = threading.Thread( + target=run_https_server, args=(https_port, https_server), daemon=True + ) + https_thread.start() try: # Keep main thread alive http_thread.join() - https_thread.join() + if https_thread: + https_thread.join() except KeyboardInterrupt: print("\nShutting down mock servers...") 
http_server.shutdown() - https_server.shutdown() + if https_server: + https_server.shutdown() if __name__ == "__main__": diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-noop-token.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-noop-token.yaml new file mode 100644 index 000000000..4dfd3ed4e --- /dev/null +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-noop-token.yaml @@ -0,0 +1,31 @@ +name: Lightspeed Core Service (LCS) +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: false + workers: 1 + color_log: true + access_log: true +llama_stack: + # Uses a remote llama-stack service + # The instance would have already been started with a llama-stack-run.yaml file + use_as_library_client: false + # Alternative for "as library use" + # use_as_library_client: true + # library_client_config_path: + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 + api_key: xyzzy +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +# Conversation cache for storing Q&A history +conversation_cache: + type: "sqlite" + sqlite: + db_path: "/tmp/data/conversation-cache.db" + +authentication: + module: "noop-with-token" diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-rh-identity.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-rh-identity.yaml new file mode 100644 index 000000000..e2b468cf0 --- /dev/null +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-rh-identity.yaml @@ -0,0 +1,25 @@ +name: Lightspeed Core Service (LCS) - RH Identity Auth +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: true + workers: 1 + color_log: true + access_log: true +llama_stack: + use_as_library_client: false + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 + api_key: xyzzy +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" 
+conversation_cache: + type: "sqlite" + sqlite: + db_path: "/tmp/data/conversation-cache.db" +authentication: + module: "rh-identity" + rh_identity_config: + required_entitlements: ["rhel"] diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-invalid-feedback-storage.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-invalid-feedback-storage.yaml new file mode 100644 index 000000000..eb6ba2054 --- /dev/null +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-invalid-feedback-storage.yaml @@ -0,0 +1,25 @@ +name: Lightspeed Core Service (LCS) +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: false + workers: 1 + color_log: true + access_log: true +llama_stack: + # Uses a remote llama-stack service + # The instance would have already been started with a llama-stack-run.yaml file + use_as_library_client: false + # Alternative for "as library use" + # use_as_library_client: true + # library_client_config_path: + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 + api_key: xyzzy +user_data_collection: + feedback_enabled: true + feedback_storage: "/invalid" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +authentication: + module: "noop-with-token" diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-no-cache.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-no-cache.yaml new file mode 100644 index 000000000..6c8f31438 --- /dev/null +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-no-cache.yaml @@ -0,0 +1,27 @@ +name: Lightspeed Core Service (LCS) +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: false + workers: 1 + color_log: true + access_log: true +llama_stack: + # Uses a remote llama-stack service + # The instance would have already been started with a llama-stack-run.yaml file + use_as_library_client: false + # Alternative for "as library use" + # use_as_library_client: true + # library_client_config_path: + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 + api_key: xyzzy +user_data_collection: + feedback_enabled: true 
+ feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +# NO conversation_cache configured - for testing error handling + +authentication: + module: "noop-with-token" diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-rbac.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-rbac.yaml new file mode 100644 index 000000000..e2e4bfa16 --- /dev/null +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-rbac.yaml @@ -0,0 +1,94 @@ +name: Lightspeed Core Service (RBAC E2E Tests) +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: true + workers: 1 + color_log: true + access_log: true + +llama_stack: + use_as_library_client: false + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 + api_key: xyzzy + +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +# Conversation cache for storing Q&A history +conversation_cache: + type: "sqlite" + sqlite: + db_path: "/tmp/data/conversation-cache.db" + +# JWK token authentication with role extraction +authentication: + module: "jwk-token" + jwk_config: + url: "http://mock-jwks:8000/.well-known/jwks.json" + jwt_configuration: + user_id_claim: "sub" + username_claim: "name" + # Role rules: extract roles from JWT claims + role_rules: + # Grant 'admin' role to users with admin=true in JWT + - jsonpath: "$.admin" + operator: "equals" + value: [true] + roles: ["admin"] + # Grant 'user' role to users with role=user in JWT + - jsonpath: "$.role" + operator: "equals" + value: ["user"] + roles: ["user"] + # Grant 'viewer' role to users with role=viewer in JWT + - jsonpath: "$.role" + operator: "equals" + value: ["viewer"] + roles: ["viewer"] + # Grant 'query_only' role based on permissions array containing 'query' + - jsonpath: "$.permissions[*]" + operator: "contains" + value: "query" + roles: ["query_only"] + +# Authorization: map roles to actions +authorization: + 
access_rules: + # Admin role gets full access + - role: "admin" + actions: ["admin"] + # User role can query, access conversations, and provide feedback + - role: "user" + actions: + - "query" + - "streaming_query" + - "get_conversation" + - "list_conversations" + - "delete_conversation" + - "update_conversation" + - "feedback" + - "get_models" + - "get_tools" + - "info" + - "model_override" + # Viewer role can only read (no mutations) + - role: "viewer" + actions: + - "get_conversation" + - "list_conversations" + - "get_models" + - "get_tools" + - "info" + # Query-only role can only query (no model_override - must use defaults) + - role: "query_only" + actions: + - "query" + - "streaming_query" + # Everyone (*) role gets basic info access + - role: "*" + actions: + - "info" diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack.yaml index cd667a4f0..b1fecbdbb 100644 --- a/tests/e2e-prow/rhoai/configs/lightspeed-stack.yaml +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack.yaml @@ -23,3 +23,21 @@ user_data_collection: authentication: module: "noop" + +mcp_servers: + # Mock server with client-provided auth - should appear in mcp-auth/client-options response + - name: "github-api" + provider_id: "model-context-protocol" + url: "http://mcp-mock-server:3000" + authorization_headers: + Authorization: "client" + # Mock server with client-provided auth (different header) - should appear in response + - name: "gitlab-api" + provider_id: "model-context-protocol" + url: "http://mcp-mock-server:3000" + authorization_headers: + X-API-Token: "client" + # Mock server with no auth - should NOT appear in response + - name: "public-api" + provider_id: "model-context-protocol" + url: "http://mcp-mock-server:3000" diff --git a/tests/e2e-prow/rhoai/configs/run.yaml b/tests/e2e-prow/rhoai/configs/run.yaml index 20f11f547..935ae206f 100644 --- a/tests/e2e-prow/rhoai/configs/run.yaml +++ b/tests/e2e-prow/rhoai/configs/run.yaml @@ 
-1,5 +1,5 @@ version: 2 -image_name: rhoai-configuration +image_name: starter apis: - agents @@ -12,24 +12,19 @@ apis: - scoring - tool_runtime - vector_io - + benchmarks: [] datasets: [] -# external_providers_dir: /opt/app-root/src/.llama/providers.d providers: inference: - provider_id: vllm provider_type: remote::vllm config: - url: ${env.KSVC_URL}/v1/ + base_url: ${env.KSVC_URL}/v1/ api_token: ${env.VLLM_API_KEY} tls_verify: false - max_tokens: 1024 - # - provider_id: openai - # provider_type: remote::openai - # config: - # api_key: ${env.OPENAI_API_KEY} + max_tokens: 512 - config: {} provider_id: sentence-transformers provider_type: inline::sentence-transformers @@ -38,24 +33,34 @@ providers: metadata_store: table_name: files_metadata backend: sql_default - storage_dir: ~/.llama/storage/files + storage_dir: /opt/app-root/src/.llama/storage/files provider_id: meta-reference-files provider_type: inline::localfs safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] + - config: + excluded_categories: [] + provider_id: llama-guard + provider_type: inline::llama-guard scoring: - - config: {} - provider_id: basic + - provider_id: basic provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: '********' tool_runtime: - - config: {} + - config: {} # Enable the RAG tool provider_id: rag-runtime provider_type: inline::rag-runtime + - config: {} # Enable MCP (Model Context Protocol) support + provider_id: model-context-protocol + provider_type: remote::model-context-protocol vector_io: - - config: + - config: # Define the storage backend for RAG persistence: namespace: vector_io::faiss backend: kv_rag @@ -104,12 +109,15 @@ server: port: 8321 storage: backends: - kv_default: + kv_default: # Single database for registry AND RAG data type: kv_sqlite - db_path: 
${env.KV_STORE_PATH:=~/.llama/storage/rag/kv_store.db} + db_path: /opt/app-root/src/.llama/storage/rag/kv_store.db + kv_rag: + type: kv_sqlite + db_path: /opt/app-root/src/.llama/storage/rag/kv_store.db sql_default: type: sql_sqlite - db_path: ${env.SQL_STORE_PATH:=~/.llama/storage/sql_store.db} + db_path: ${env.SQL_STORE_PATH:=/opt/app-root/src/.llama/storage/sql_store.db} stores: metadata: namespace: registry @@ -127,7 +135,7 @@ storage: backend: kv_default registered_resources: models: - - model_id: meta-llama/Llama-3.2-1B-Instruct + - model_id: meta-llama/Llama-3.1-8B-Instruct provider_id: vllm model_type: llm provider_model_id: null @@ -141,20 +149,20 @@ registered_resources: - embedding_dimension: 768 embedding_model: sentence-transformers/all-mpnet-base-v2 provider_id: faiss - vector_store_id: vs_8c94967b-81cc-4028-a294-9cfac6fd9ae2 #TODO: parse this from the rag db + vector_store_id: ${env.FAISS_VECTOR_STORE_ID} shields: - shield_id: llama-guard provider_id: llama-guard - provider_shield_id: vllm/meta-llama/Llama-3.2-1B-Instruct + provider_shield_id: vllm/meta-llama/Llama-3.1-8B-Instruct datasets: [] scoring_fns: [] benchmarks: [] tool_groups: - - toolgroup_id: builtin::rag - provider_id: rag-runtime + - provider_id: rag-runtime + toolgroup_id: builtin::rag vector_stores: default_provider_id: faiss - default_embedding_model: + default_embedding_model: # Define the default embedding model for RAG provider_id: sentence-transformers model_id: all-mpnet-base-v2 safety: diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml index 005f96978..31d934e28 100644 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml @@ -6,8 +6,25 @@ metadata: spec: imagePullSecrets: - name: quay-lightspeed-pull-secret + initContainers: + - name: setup-rag-data + image: busybox:latest + command: + - /bin/sh + - -c + - | + mkdir -p 
/data/storage/rag + gunzip -c /rag-data/kv_store.db.gz > /data/storage/rag/kv_store.db + echo "RAG data extracted successfully" + ls -la /data/storage/rag/ + volumeMounts: + - name: app-root + mountPath: /data + - name: rag-data + mountPath: /rag-data containers: - name: llama-stack-container + command: ["llama", "stack", "run", "/opt/app-root/run.yaml"] env: - name: KSVC_URL valueFrom: @@ -19,6 +36,13 @@ spec: secretKeyRef: name: vllm-api-key-secret key: key + - name: INFERENCE_MODEL + value: "meta-llama/Llama-3.1-8B-Instruct" + - name: FAISS_VECTOR_STORE_ID + valueFrom: + secretKeyRef: + name: faiss-vector-store-secret + key: id image: ${LLAMA_STACK_IMAGE} ports: - containerPort: 8321 @@ -34,3 +58,6 @@ spec: - name: config configMap: name: llama-stack-config + - name: rag-data + configMap: + name: rag-data diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/mcp-mock-server.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/mcp-mock-server.yaml new file mode 100644 index 000000000..a8e236f68 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/mcp-mock-server.yaml @@ -0,0 +1,50 @@ +apiVersion: v1 +kind: Pod +metadata: + name: mcp-mock-server + namespace: e2e-rhoai-dsc + labels: + app: mcp-mock-server +spec: + containers: + - name: mcp-mock-server + image: python:3.12-slim + # Run HTTP-only version of the mock server + command: ["python", "/app/server.py", "3000"] + env: + - name: MCP_HTTP_ONLY + value: "true" + ports: + - containerPort: 3000 + volumeMounts: + - name: server-script + mountPath: /app/server.py + subPath: server.py + readinessProbe: + httpGet: + path: / + port: 3000 + initialDelaySeconds: 5 + periodSeconds: 5 + livenessProbe: + httpGet: + path: / + port: 3000 + initialDelaySeconds: 10 + periodSeconds: 10 + volumes: + - name: server-script + configMap: + name: mcp-mock-server-script +--- +apiVersion: v1 +kind: Service +metadata: + name: mcp-mock-server + namespace: e2e-rhoai-dsc +spec: + selector: + app: mcp-mock-server + ports: + - port: 
3000 + targetPort: 3000 diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/mock-jwks.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/mock-jwks.yaml new file mode 100644 index 000000000..b1555c730 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/mock-jwks.yaml @@ -0,0 +1,46 @@ +apiVersion: v1 +kind: Pod +metadata: + name: mock-jwks + namespace: e2e-rhoai-dsc + labels: + app: mock-jwks +spec: + containers: + - name: mock-jwks + image: python:3.12-slim + command: ["python", "/app/server.py"] + ports: + - containerPort: 8000 + volumeMounts: + - name: server-script + mountPath: /app/server.py + subPath: server.py + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 2 + periodSeconds: 5 + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 10 + volumes: + - name: server-script + configMap: + name: mock-jwks-script +--- +apiVersion: v1 +kind: Service +metadata: + name: mock-jwks + namespace: e2e-rhoai-dsc +spec: + selector: + app: mock-jwks + ports: + - port: 8000 + targetPort: 8000 diff --git a/tests/e2e-prow/rhoai/manifests/test-pod/spin-up.yaml b/tests/e2e-prow/rhoai/manifests/test-pod/spin-up.yaml deleted file mode 100644 index f11778c0d..000000000 --- a/tests/e2e-prow/rhoai/manifests/test-pod/spin-up.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: test-pod - namespace: e2e-rhoai-dsc -spec: - containers: - - name: test-container - env: - - name: E2E_LSC_HOSTNAME - valueFrom: - secretKeyRef: - name: lcs-ip-secret - key: key - - name: E2E_LLAMA_HOSTNAME - valueFrom: - secretKeyRef: - name: llama-stack-ip-secret - key: key - image: registry.access.redhat.com/ubi9/python-312 - command: ["/bin/sh", "/scripts/run-tests.sh"] - volumeMounts: - - name: script-volume - mountPath: /scripts - volumes: - - name: script-volume - configMap: - name: test-script-cm - defaultMode: 0755 # Make the script executable - restartPolicy: Never \ No newline at end of 
file diff --git a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml index 4c81d6b01..4c3f5e7bd 100644 --- a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml @@ -13,7 +13,7 @@ spec: containers: - args: # - /mnt/models/ - - meta-llama/Llama-3.2-1B-Instruct + - meta-llama/Llama-3.1-8B-Instruct - --enable-auto-tool-choice - --tool-call-parser - llama3_json diff --git a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml index 2027cfcf2..b7597991c 100644 --- a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml @@ -13,12 +13,12 @@ spec: containers: - args: - --model - - meta-llama/Llama-3.2-1B-Instruct + - meta-llama/Llama-3.1-8B-Instruct - --enable-auto-tool-choice - --tool-call-parser - llama3_json - --chat-template - - /mnt/chat-template/tool_chat_template_llama3.2_json.jinja + - /mnt/chat-template/tool_chat_template_llama3.1_json.jinja - --download-dir - /tmp/models-cache - --port diff --git a/tests/e2e-prow/rhoai/pipeline-services.sh b/tests/e2e-prow/rhoai/pipeline-services.sh index 8d011bce7..cd33ab9d5 100755 --- a/tests/e2e-prow/rhoai/pipeline-services.sh +++ b/tests/e2e-prow/rhoai/pipeline-services.sh @@ -2,10 +2,11 @@ BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Deploy llama-stack envsubst < "$BASE_DIR/manifests/lightspeed/llama-stack.yaml" | oc apply -f - oc wait pod/llama-stack-service \ --n e2e-rhoai-dsc --for=condition=Ready --timeout=600s + -n e2e-rhoai-dsc --for=condition=Ready --timeout=600s # Get url address of llama-stack pod oc label pod llama-stack-service pod=llama-stack-service -n e2e-rhoai-dsc @@ -22,4 +23,5 @@ oc create secret generic llama-stack-ip-secret \ --from-literal=key="$E2E_LLAMA_HOSTNAME" \ -n e2e-rhoai-dsc || echo "Secret exists" -oc apply -f 
"$BASE_DIR/manifests/lightspeed/lightspeed-stack.yaml" \ No newline at end of file +# Deploy lightspeed-stack +oc apply -f "$BASE_DIR/manifests/lightspeed/lightspeed-stack.yaml" diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 718dc36ae..e94b7e6f8 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -2,22 +2,19 @@ set -euo pipefail trap 'echo "❌ Pipeline failed at line $LINENO"; exit 1' ERR +# Signal to e2e tests that we're running in Prow/OpenShift +export RUNNING_PROW=true #======================================== # 1. GLOBAL CONFIG #======================================== NAMESPACE="e2e-rhoai-dsc" -MODEL_NAME="meta-llama/Llama-3.2-1B-Instruct" +MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# Get llama-stack image from GitHub Containerfile -echo "Fetching llama-stack image from GitHub..." -LLAMA_STACK_IMAGE=$(curl -sL https://raw.githubusercontent.com/lightspeed-core/lightspeed-stack/main/test.containerfile | grep -m1 '^FROM' | awk '{print $2}') -if [ -z "$LLAMA_STACK_IMAGE" ]; then - echo "❌ Failed to fetch llama-stack image from GitHub" - exit 1 -fi -echo " -> Found llama-stack image: $LLAMA_STACK_IMAGE" +# RHOAI llama-stack image +LLAMA_STACK_IMAGE="${LLAMA_STACK_IMAGE:-quay.io/rhoai/odh-llama-stack-core-rhel9:rhoai-3.3}" +echo "Using llama-stack image: $LLAMA_STACK_IMAGE" export LLAMA_STACK_IMAGE #======================================== @@ -77,12 +74,12 @@ oc secrets link default quay-lightspeed-pull-secret --for=pull -n "$NAMESPACE" 2 #======================================== echo "===== Setting up configmaps =====" -curl -sL -o tool_chat_template_llama3.2_json.jinja \ - https://raw.githubusercontent.com/vllm-project/vllm/main/examples/tool_chat_template_llama3.2_json.jinja \ +curl -sL -o tool_chat_template_llama3.1_json.jinja \ + 
https://raw.githubusercontent.com/vllm-project/vllm/main/examples/tool_chat_template_llama3.1_json.jinja \ || { echo "❌ Failed to download jinja template"; exit 1; } oc create configmap vllm-chat-template -n "$NAMESPACE" \ - --from-file=tool_chat_template_llama3.2_json.jinja --dry-run=client -o yaml | oc apply -f - + --from-file=tool_chat_template_llama3.1_json.jinja --dry-run=client -o yaml | oc apply -f - #======================================== @@ -97,7 +94,7 @@ oc get pods -n "$NAMESPACE" # 6. WAIT FOR POD & TEST API #======================================== source pod.env -oc wait --for=condition=Ready pod/$POD_NAME -n $NAMESPACE --timeout=300s +oc wait --for=condition=Ready pod/$POD_NAME -n $NAMESPACE --timeout=600s echo "===== Testing vLLM endpoint =====" start_time=$(date +%s) @@ -154,14 +151,89 @@ oc delete pod vllm-test-curl -n "$NAMESPACE" --ignore-not-found=true #======================================== -# 7. DEPLOY LIGHTSPEED STACK AND LLAMA STACK +# 7. DEPLOY MOCK SERVERS (JWKS & MCP) +#======================================== +echo "===== Deploying Mock Servers =====" + +# Navigate to repo root to access server scripts +REPO_ROOT="$(cd "$PIPELINE_DIR/../../.." && pwd)" + +# Create ConfigMaps from server scripts +echo "Creating mock server ConfigMaps..." +oc create configmap mock-jwks-script -n "$NAMESPACE" \ + --from-file=server.py="$REPO_ROOT/tests/e2e/mock_jwks_server/server.py" \ + --dry-run=client -o yaml | oc apply -f - + +oc create configmap mcp-mock-server-script -n "$NAMESPACE" \ + --from-file=server.py="$REPO_ROOT/dev-tools/mcp-mock-server/server.py" \ + --dry-run=client -o yaml | oc apply -f - + +# Deploy mock server pods and services +echo "Deploying mock-jwks..." +oc apply -f "$PIPELINE_DIR/manifests/lightspeed/mock-jwks.yaml" + +echo "Deploying mcp-mock-server..." +oc apply -f "$PIPELINE_DIR/manifests/lightspeed/mcp-mock-server.yaml" + +# Wait for mock servers to be ready +echo "Waiting for mock servers to be ready..." 
+oc wait pod/mock-jwks pod/mcp-mock-server \ + -n "$NAMESPACE" --for=condition=Ready --timeout=120s || { + echo "⚠️ Mock servers not ready, checking status..." + oc get pods -n "$NAMESPACE" | grep -E "mock-jwks|mcp-mock" || true + oc describe pod mock-jwks -n "$NAMESPACE" 2>/dev/null | tail -20 || true + oc describe pod mcp-mock-server -n "$NAMESPACE" 2>/dev/null | tail -20 || true + echo "❌ Mock servers failed to become ready" + exit 1 +} +echo "✅ Mock servers deployed" + +#======================================== +# 8. DEPLOY LIGHTSPEED STACK AND LLAMA STACK #======================================== echo "===== Deploying Services =====" create_secret api-url-secret --from-literal=key="$KSVC_URL" oc create configmap llama-stack-config -n "$NAMESPACE" --from-file=configs/run.yaml oc create configmap lightspeed-stack-config -n "$NAMESPACE" --from-file=configs/lightspeed-stack.yaml -oc create configmap test-script-cm -n "$NAMESPACE" --from-file=run-tests.sh + +# Create RAG data ConfigMap from the e2e test RAG data +echo "Creating RAG data ConfigMap..." +RAG_DB_PATH="$REPO_ROOT/tests/e2e/rag/kv_store.db" +if [ -f "$RAG_DB_PATH" ]; then + # Extract vector store ID from kv_store.db using Python (sqlite3 CLI may not be available) + echo "Extracting vector store ID from kv_store.db..." 
+ # Key format is: vector_stores:v3::vs_xxx or openai_vector_stores:v3::vs_xxx + export FAISS_VECTOR_STORE_ID=$(python3 -c " +import sqlite3 +import re +conn = sqlite3.connect('$RAG_DB_PATH') +cursor = conn.cursor() +cursor.execute(\"SELECT key FROM kvstore WHERE key LIKE 'vector_stores:v%::%' LIMIT 1\") +row = cursor.fetchone() +if row: + # Extract the vs_xxx ID from the key + match = re.search(r'(vs_[a-f0-9-]+)', row[0]) + if match: + print(match.group(1)) +conn.close() +" 2>/dev/null || echo "") + + if [ -n "$FAISS_VECTOR_STORE_ID" ]; then + echo "✅ Extracted FAISS_VECTOR_STORE_ID: $FAISS_VECTOR_STORE_ID" + # Create secret for llama-stack to use + create_secret faiss-vector-store-secret --from-literal=id="$FAISS_VECTOR_STORE_ID" + else + echo "❌ No vector_store found in $RAG_DB_PATH - FAISS tests will fail!" + fi + + gzip -c "$RAG_DB_PATH" > /tmp/kv_store.db.gz + oc create configmap rag-data -n "$NAMESPACE" --from-file=kv_store.db.gz=/tmp/kv_store.db.gz + rm /tmp/kv_store.db.gz + echo "✅ RAG data ConfigMap created from $RAG_DB_PATH" +else + echo "⚠️ No kv_store.db found at $RAG_DB_PATH" +fi ./pipeline-services.sh @@ -191,7 +263,6 @@ if ! oc wait pod/lightspeed-stack-service pod/llama-stack-service \ exit 1 fi echo "✅ Both service pods are ready" -sleep 30 oc get pods -n "$NAMESPACE" @@ -205,7 +276,7 @@ oc describe pod llama-stack-service -n "$NAMESPACE" || true #======================================== -# 8. EXTRACT LCS IP & STORE +# 9. EXPOSE SERVICE & START PORT-FORWARD #======================================== oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n $NAMESPACE @@ -215,54 +286,78 @@ oc expose pod lightspeed-stack-service \ --type=ClusterIP \ -n $NAMESPACE -E2E_LSC_HOSTNAME="lightspeed-stack-service-svc.$NAMESPACE.svc.cluster.local" -echo "LCS IP: $E2E_LSC_HOSTNAME" - -create_secret lcs-ip-secret --from-literal=key="$E2E_LSC_HOSTNAME" - - -#======================================== -# 9. 
LOGGING & TEST EXECUTION -#======================================== -echo "===== Running test pod =====" -./pipeline-test-pod.sh - -sleep 20 -oc get pods -n "$NAMESPACE" - -# Wait until tests are complete -oc wait --for=condition=Ready=True pod/test-pod -n $NAMESPACE --timeout=900s || oc wait --for=condition=Ready=False pod/test-pod -n $NAMESPACE --timeout=60s - -start_time=$(date +%s) -timeout=2400 -while true; do - sleep 120 - - PHASE=$(oc get pod test-pod -n $NAMESPACE -o jsonpath='{.status.phase}') - echo "Current phase test-pod: $PHASE" - if [[ "$PHASE" == "Succeeded" || "$PHASE" == "Failed" ]]; then - break +# Kill any existing processes on ports 8080 and 8000 +echo "Checking for existing processes on ports 8080 and 8000..." +lsof -ti:8080 | xargs kill -9 2>/dev/null || true +lsof -ti:8000 | xargs kill -9 2>/dev/null || true + +# Start port-forward for lightspeed-stack +echo "Starting port-forward for lightspeed-stack..." +oc port-forward svc/lightspeed-stack-service-svc 8080:8080 -n $NAMESPACE & +PF_LCS_PID=$! + +# Start port-forward for mock-jwks (needed for RBAC tests to get tokens) +echo "Starting port-forward for mock-jwks..." +oc port-forward svc/mock-jwks 8000:8000 -n $NAMESPACE & +PF_JWKS_PID=$! + +# Wait for port-forward to be usable (app may not be listening immediately; port-forward can drop) +echo "Waiting for port-forward to lightspeed-stack to be ready..." +for i in $(seq 1 36); do + if curl -sf http://localhost:8080/v1/models > /dev/null 2>&1; then + echo "✅ Port-forward ready after $(( i * 5 ))s" + break fi - - current_time=$(date +%s) - elapsed=$((current_time - start_time)) - - if (( elapsed >= timeout )); then - echo "⏰ Timeout reached ($timeout seconds). Stopping test." 
- exit 1 + if [ $i -eq 36 ]; then + echo "❌ Port-forward to lightspeed-stack never became ready (3 min)" + kill $PF_LCS_PID 2>/dev/null || true + kill $PF_JWKS_PID 2>/dev/null || true + exit 1 fi - - oc get pods -n "$NAMESPACE" + # If port-forward process died, restart it (e.g. "connection refused" / "lost connection to pod") + if ! kill -0 $PF_LCS_PID 2>/dev/null; then + echo "Port-forward died, restarting (attempt $i)..." + oc port-forward svc/lightspeed-stack-service-svc 8080:8080 -n $NAMESPACE & + PF_LCS_PID=$! + fi + sleep 5 done -oc logs test-pod -n $NAMESPACE || oc describe pod test-pod -n $NAMESPACE || true + +export E2E_LSC_HOSTNAME="localhost" +export E2E_JWKS_HOSTNAME="localhost" +echo "LCS accessible at: http://$E2E_LSC_HOSTNAME:8080" +echo "Mock JWKS accessible at: http://$E2E_JWKS_HOSTNAME:8000" -TEST_EXIT_CODE=$(oc get pod test-pod -n $NAMESPACE -o jsonpath='{.status.containerStatuses[0].state.terminated.exitCode}') + +#======================================== +# 10. RUN TESTS +#======================================== +echo "===== Running E2E tests =====" + +# Ensure run-tests.sh is executable +chmod +x ./run-tests.sh + +# Run tests and cleanup port-forwards. Disable ERR trap so we can capture test exit code and reap +# killed port-forwards without the trap firing (ERR fires on any non-zero exit, not only when set -e would exit). +trap - ERR +set +e +export E2E_EXIT_CODE_FILE="${PIPELINE_DIR}/.e2e_exit_code" +./run-tests.sh +# Read exit code from file so we get the real test result (shell can overwrite $? 
with "PID Killed" before we use it) +TEST_EXIT_CODE=$(cat "$E2E_EXIT_CODE_FILE" 2>/dev/null || echo 1) +# Kill first so wait doesn't block (if a port-forward is still running, wait would hang) +kill $PF_LCS_PID 2>/dev/null || true +kill $PF_JWKS_PID 2>/dev/null || true +wait $PF_LCS_PID 2>/dev/null || true +wait $PF_JWKS_PID 2>/dev/null || true +set -e +trap 'echo "❌ Pipeline failed at line $LINENO"; exit 1' ERR echo "===== E2E COMPLETE =====" -if [ "${TEST_EXIT_CODE:-2}" -ne 0 ]; then - echo "❌ E2E tests failed with exit code $TEST_EXIT_CODE (pod/test-pod failed)" +if [ "${TEST_EXIT_CODE:-1}" -ne 0 ]; then + echo "❌ E2E tests failed with exit code $TEST_EXIT_CODE" else echo "✅ E2E tests succeeded" fi diff --git a/tests/e2e-prow/rhoai/run-tests.sh b/tests/e2e-prow/rhoai/run-tests.sh old mode 100644 new mode 100755 index 657b8124b..41f5020e3 --- a/tests/e2e-prow/rhoai/run-tests.sh +++ b/tests/e2e-prow/rhoai/run-tests.sh @@ -1,17 +1,50 @@ -git clone https://github.com/lightspeed-core/lightspeed-stack.git -cd lightspeed-stack +#!/bin/bash +set -e -echo "pod started" -echo $E2E_LSC_HOSTNAME +# Go to repo root (run-tests.sh is in tests/e2e-prow/rhoai/) +cd "$(dirname "$0")/../../.." -curl -f http://$E2E_LSC_HOSTNAME:8080/v1/models || { - echo "❌ Basic connectivity failed - showing logs before running full tests" +# Timestamps to pinpoint where time is spent (e.g. if Prow 2h timeout is hit) +ts() { echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] $*"; } + +# FAISS_VECTOR_STORE_ID should be exported by pipeline.sh +if [ -z "$FAISS_VECTOR_STORE_ID" ]; then + echo "❌ FAISS_VECTOR_STORE_ID is not set - should be exported by pipeline.sh" exit 1 -} +fi + +ts "Start run-tests.sh" +echo "Running tests from: $(pwd)" +echo "E2E_LSC_HOSTNAME: $E2E_LSC_HOSTNAME" +echo "FAISS_VECTOR_STORE_ID: $FAISS_VECTOR_STORE_ID" +# Wait for service to be ready (retry up to 60 seconds) +ts "Start: wait for service" +echo "Waiting for service to be ready..." 
+for i in $(seq 1 12); do + if curl -sf http://$E2E_LSC_HOSTNAME:8080/v1/models > /dev/null 2>&1; then + echo "✅ Service is responding" + break + fi + if [ $i -eq 12 ]; then + echo "❌ Basic connectivity failed after 60 seconds" + exit 1 + fi + echo " Attempt $i/12 - service not ready, waiting 5s..." + sleep 5 +done +ts "End: wait for service" + +ts "Start: pip install uv" echo "Installing test dependencies..." pip install uv +ts "End: pip install uv" + +ts "Start: uv sync" uv sync +ts "End: uv sync" -echo "Running comprehensive e2e test suite..." -make test-e2e \ No newline at end of file +ts "Start: make test-e2e" +echo "Running e2e test suite..." +make test-e2e +ts "End: make test-e2e" diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh new file mode 100755 index 000000000..0beb9e28c --- /dev/null +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -0,0 +1,234 @@ +#!/bin/bash +# Consolidated E2E operations script for OpenShift/Prow environment +# Usage: e2e-ops.sh [args...] 
+# +# Commands: +# restart-lightspeed - Restart lightspeed-stack pod and port-forward +# restart-llama-stack - Restart/restore llama-stack pod +# restart-port-forward - Re-establish port-forward for lightspeed +# wait-for-pod [attempts] - Wait for a pod to be ready +# update-configmap - Update ConfigMap from file +# get-configmap-content - Get ConfigMap content (outputs to stdout) +# disrupt-llama-stack - Delete llama-stack pod to disrupt connection + +set -e + +NAMESPACE="${NAMESPACE:-e2e-rhoai-dsc}" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +MANIFEST_DIR="$SCRIPT_DIR/../manifests/lightspeed" + +# ============================================================================ +# Helper functions +# ============================================================================ + +wait_for_pod() { + local pod_name="$1" + local max_attempts="${2:-24}" + + for ((attempt=1; attempt<=max_attempts; attempt++)); do + local ready + ready=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false") + if [[ "$ready" == "true" ]]; then + echo "✓ Pod $pod_name ready" + return 0 + fi + sleep 3 + done + + echo "Pod $pod_name not ready after $((max_attempts * 3))s" + return 1 +} + +verify_connectivity() { + local max_attempts="${1:-6}" + local local_port="${LOCAL_PORT:-8080}" + local http_code="" + + for ((attempt=1; attempt<=max_attempts; attempt++)); do + # Check readiness endpoint - accept 200 or 401 (auth required but service is up) + http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://localhost:$local_port/readiness" 2>/dev/null) || http_code="000" + + if [[ "$http_code" == "200" || "$http_code" == "401" ]]; then + return 0 + fi + + if [[ $attempt -lt $max_attempts ]]; then + sleep 2 + fi + done + + echo "Connectivity check failed (HTTP: ${http_code:-unknown})" + return 1 +} + +# ============================================================================ +# Command implementations +# 
============================================================================ + +cmd_restart_lightspeed() { + echo "Restarting lightspeed-stack service..." + + # Delete existing pod with timeout + timeout 60 oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true || { + oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --force --grace-period=0 2>/dev/null || true + sleep 2 + } + + # Apply manifest + oc apply -f "$MANIFEST_DIR/lightspeed-stack.yaml" + + # Wait for pod to be ready + wait_for_pod "lightspeed-stack-service" 20 + + # Re-label pod for service discovery + oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite + + # Re-establish port-forward + cmd_restart_port_forward + + echo "✓ Lightspeed restart complete" +} + +cmd_restart_llama_stack() { + echo "===== Restoring llama-stack service =====" + + # Apply manifest (creates pod if not exists) + # Use envsubst to expand ${LLAMA_STACK_IMAGE} and other env vars + echo "Applying pod manifest..." + envsubst < "$MANIFEST_DIR/llama-stack.yaml" | oc apply -f - + + # Wait for pod to be ready + wait_for_pod "llama-stack-service" 24 + + # Re-label pod for service discovery + echo "Labeling pod for service..." + oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite + + echo "===== Llama-stack restore complete =====" +} + +cmd_restart_port_forward() { + local local_port="${LOCAL_PORT:-8080}" + local remote_port="${REMOTE_PORT:-8080}" + local max_attempts=3 + + echo "Re-establishing port-forward on $local_port:$remote_port..." + + for ((attempt=1; attempt<=max_attempts; attempt++)); do + # Kill existing port-forward processes + pkill -9 -f "oc port-forward.*lightspeed" 2>/dev/null || true + sleep 1 + + # Start new port-forward in background + nohup oc port-forward svc/lightspeed-stack-service-svc "$local_port:$remote_port" -n "$NAMESPACE" > /tmp/port-forward.log 2>&1 & + local pf_pid=$! 
+ disown $pf_pid 2>/dev/null || true + sleep 5 + + # Verify connectivity (more attempts for larger models) + if verify_connectivity 10; then + echo "✓ Port-forward established (PID: $pf_pid)" + return 0 + fi + + if [[ $attempt -lt $max_attempts ]]; then + echo "Attempt $attempt failed, retrying..." + sleep 3 + fi + done + + echo "Failed to establish port-forward" + cat /tmp/port-forward.log 2>/dev/null | tail -5 || true + return 1 +} + +cmd_wait_for_pod() { + local pod_name="${1:?Pod name required}" + local max_attempts="${2:-24}" + wait_for_pod "$pod_name" "$max_attempts" +} + +cmd_update_configmap() { + local configmap_name="${1:?ConfigMap name required}" + local source_file="${2:?Source file required}" + + echo "Updating ConfigMap $configmap_name from $source_file..." + + # Delete existing configmap + oc delete configmap "$configmap_name" -n "$NAMESPACE" --ignore-not-found=true + + # Create new configmap from the source file + oc create configmap "$configmap_name" -n "$NAMESPACE" \ + --from-file="lightspeed-stack.yaml=$source_file" + + echo "✓ ConfigMap $configmap_name updated successfully" +} + +cmd_get_configmap_content() { + local configmap_name="${1:?ConfigMap name required}" + oc get configmap "$configmap_name" -n "$NAMESPACE" \ + -o 'jsonpath={.data.lightspeed-stack\.yaml}' +} + +cmd_disrupt_llama_stack() { + local pod_name="llama-stack-service" + + # Check if pod exists and is running + local phase + phase=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound") + + if [[ "$phase" == "Running" ]]; then + # Delete the pod to disrupt connection + oc delete pod "$pod_name" -n "$NAMESPACE" --wait=true + sleep 2 + echo "Llama Stack connection disrupted successfully (pod deleted)" + exit 0 + else + echo "Llama Stack pod was not running (phase: $phase)" + exit 2 + fi +} + +# ============================================================================ +# Main command dispatcher +# 
============================================================================ + +COMMAND="${1:-}" +shift || true + +case "$COMMAND" in + restart-lightspeed) + cmd_restart_lightspeed + ;; + restart-llama-stack) + cmd_restart_llama_stack + ;; + restart-port-forward) + cmd_restart_port_forward + ;; + wait-for-pod) + cmd_wait_for_pod "$@" + ;; + update-configmap) + cmd_update_configmap "$@" + ;; + get-configmap-content) + cmd_get_configmap_content "$@" + ;; + disrupt-llama-stack) + cmd_disrupt_llama_stack + ;; + *) + echo "Usage: $0 [args...]" + echo "" + echo "Commands:" + echo " restart-lightspeed - Restart lightspeed-stack pod and port-forward" + echo " restart-llama-stack - Restart/restore llama-stack pod" + echo " restart-port-forward - Re-establish port-forward for lightspeed" + echo " wait-for-pod [attempts] - Wait for a pod to be ready" + echo " update-configmap - Update ConfigMap from file" + echo " get-configmap-content - Get ConfigMap content (outputs to stdout)" + echo " disrupt-llama-stack - Delete llama-stack pod to disrupt connection" + exit 1 + ;; +esac diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index 3d6a4fdae..543803f4c 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -13,10 +13,12 @@ import requests from behave.model import Feature, Scenario +from tests.e2e.utils.prow_utils import restore_llama_stack_pod from behave.runner import Context from tests.e2e.utils.utils import ( create_config_backup, + is_prow_environment, remove_config_backup, restart_container, switch_config, @@ -25,6 +27,38 @@ FALLBACK_MODEL = "gpt-4o-mini" FALLBACK_PROVIDER = "openai" +# Config file mappings: config_name -> (docker_path, prow_path) +_CONFIG_PATHS = { + "no-cache": ( + "tests/e2e/configuration/{mode_dir}/lightspeed-stack-no-cache.yaml", + "tests/e2e-prow/rhoai/configs/lightspeed-stack-no-cache.yaml", + ), + "auth-noop-token": ( + 
"tests/e2e/configuration/{mode_dir}/lightspeed-stack-auth-noop-token.yaml", + "tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-noop-token.yaml", + ), + "rbac": ( + "tests/e2e/configuration/{mode_dir}/lightspeed-stack-rbac.yaml", + "tests/e2e-prow/rhoai/configs/lightspeed-stack-rbac.yaml", + ), + "invalid-feedback-storage": ( + "tests/e2e/configuration/{mode_dir}/lightspeed-stack-invalid-feedback-storage.yaml", + "tests/e2e-prow/rhoai/configs/lightspeed-stack-invalid-feedback-storage.yaml", + ), + "rh-identity": ( + "tests/e2e/configuration/{mode_dir}/lightspeed-stack-auth-rh-identity.yaml", + "tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-rh-identity.yaml", + ), +} + + +def _get_config_path(config_name: str, mode_dir: str) -> str: + """Get the appropriate config path based on environment.""" + docker_path_template, prow_path = _CONFIG_PATHS[config_name] + if is_prow_environment(): + return prow_path + return docker_path_template.format(mode_dir=mode_dir) + def _fetch_models_from_service() -> dict: """Query /v1/models endpoint and return first LLM model. 
@@ -138,11 +172,11 @@ def before_scenario(context: Context, scenario: Scenario) -> None: mode_dir = "library-mode" if context.is_library_mode else "server-mode" if "InvalidFeedbackStorageConfig" in scenario.effective_tags: - context.scenario_config = f"tests/e2e/configuration/{mode_dir}/lightspeed-stack-invalid-feedback-storage.yaml" + context.scenario_config = _get_config_path("invalid-feedback-storage", mode_dir) if "NoCacheConfig" in scenario.effective_tags: - context.scenario_config = ( - f"tests/e2e/configuration/{mode_dir}/lightspeed-stack-no-cache.yaml" - ) + context.scenario_config = _get_config_path("no-cache", mode_dir) + switch_config(context.scenario_config) + restart_container("lightspeed-stack") def after_scenario(context: Context, scenario: Scenario) -> None: @@ -171,63 +205,67 @@ def after_scenario(context: Context, scenario: Scenario) -> None: scenario-specific teardown actions to run (e.g., "InvalidFeedbackStorageConfig", "NoCacheConfig"). """ - if "InvalidFeedbackStorageConfig" in scenario.effective_tags: - switch_config(context.feature_config) - restart_container("lightspeed-stack") - if "NoCacheConfig" in scenario.effective_tags: + # Restore Llama Stack FIRST (before any lightspeed-stack restart) + llama_was_running = getattr(context, "llama_stack_was_running", False) + if llama_was_running: + _restore_llama_stack(context) + context.llama_stack_was_running = False + + # Tags that require config restoration after scenario + config_restore_tags = {"InvalidFeedbackStorageConfig", "NoCacheConfig"} + if config_restore_tags & set(scenario.effective_tags): switch_config(context.feature_config) restart_container("lightspeed-stack") - # Restore Llama Stack connection if it was disrupted (only in server mode) - if ( - not context.is_library_mode - and hasattr(context, "llama_stack_was_running") - and context.llama_stack_was_running - ): - try: - # Start the llama-stack container again - subprocess.run( - ["docker", "start", "llama-stack"], check=True, 
capture_output=True - ) - # Wait for the service to be healthy - print("Restoring Llama Stack connection...") - time.sleep(20) - - # Check if it's healthy - for attempt in range(6): # Try for 30 seconds - try: - result = subprocess.run( - [ - "docker", - "exec", - "llama-stack", - "curl", - "-f", - f"http://{context.hostname_llama}:{context.port_llama}/v1/health", - ], - capture_output=True, - timeout=5, - check=True, - ) - if result.returncode == 0: - print("✓ Llama Stack connection restored successfully") - break - except subprocess.TimeoutExpired: - print(f"⏱Health check timed out on attempt {attempt + 1}/6") - - if attempt < 5: - print( - f"Waiting for Llama Stack to be healthy... (attempt {attempt + 1}/6)" - ) - time.sleep(5) - else: - print( - "Warning: Llama Stack may not be fully healthy after restoration" - ) - - except subprocess.CalledProcessError as e: - print(f"Warning: Could not restore Llama Stack connection: {e}") +def _restore_llama_stack(context: Context) -> None: + """Restore Llama Stack connection after disruption.""" + if is_prow_environment(): + restore_llama_stack_pod() + return + + try: + # Start the llama-stack container again + subprocess.run( + ["docker", "start", "llama-stack"], check=True, capture_output=True + ) + + # Wait for the service to be healthy + print("Restoring Llama Stack connection...") + time.sleep(20) + + # Check if it's healthy + for attempt in range(6): # Try for 30 seconds + try: + result = subprocess.run( + [ + "docker", + "exec", + "llama-stack", + "curl", + "-f", + f"http://{context.hostname_llama}:{context.port_llama}/v1/health", + ], + capture_output=True, + timeout=5, + check=True, + ) + if result.returncode == 0: + print("✓ Llama Stack connection restored successfully") + break + except subprocess.TimeoutExpired: + print(f"⏱ Health check timed out on attempt {attempt + 1}/6") + + if attempt < 5: + print( + f"Waiting for Llama Stack to be healthy... 
(attempt {attempt + 1}/6)" + ) + time.sleep(5) + else: + print("Warning: Llama Stack may not be fully healthy after restoration") + + except subprocess.CalledProcessError as e: + print(f"Warning: Could not restore Llama Stack connection: {e}") def before_feature(context: Context, feature: Feature) -> None: @@ -235,29 +273,21 @@ def before_feature(context: Context, feature: Feature) -> None: Prepare per-feature test environment and apply feature-specific configuration. """ + mode_dir = "library-mode" if context.is_library_mode else "server-mode" if "Authorized" in feature.tags: - mode_dir = "library-mode" if context.is_library_mode else "server-mode" - context.feature_config = ( - f"tests/e2e/configuration/{mode_dir}/lightspeed-stack-auth-noop-token.yaml" - ) + context.feature_config = _get_config_path("auth-noop-token", mode_dir) context.default_config_backup = create_config_backup("lightspeed-stack.yaml") switch_config(context.feature_config) restart_container("lightspeed-stack") if "RBAC" in feature.tags: - mode_dir = "library-mode" if context.is_library_mode else "server-mode" - context.feature_config = ( - f"tests/e2e/configuration/{mode_dir}/lightspeed-stack-rbac.yaml" - ) + context.feature_config = _get_config_path("rbac", mode_dir) context.default_config_backup = create_config_backup("lightspeed-stack.yaml") switch_config(context.feature_config) restart_container("lightspeed-stack") if "RHIdentity" in feature.tags: - mode_dir = "library-mode" if context.is_library_mode else "server-mode" - context.feature_config = ( - f"tests/e2e/configuration/{mode_dir}/lightspeed-stack-auth-rh-identity.yaml" - ) + context.feature_config = _get_config_path("rh-identity", mode_dir) context.default_config_backup = create_config_backup("lightspeed-stack.yaml") switch_config(context.feature_config) restart_container("lightspeed-stack") @@ -292,6 +322,5 @@ def after_feature(context: Context, feature: Feature) -> None: if "Feedback" in feature.tags: for conversation_id in 
context.feedback_conversations: url = f"http://{context.hostname}:{context.port}/v1/conversations/{conversation_id}" - headers = context.auth_headers if hasattr(context, "auth_headers") else {} - response = requests.delete(url, headers=headers) - assert response.status_code == 200, url + response = requests.delete(url, timeout=10) + assert response.status_code == 200, f"{url} returned {response.status_code}" diff --git a/tests/e2e/features/steps/health.py b/tests/e2e/features/steps/health.py index 06cd4bb9d..eefbeef04 100644 --- a/tests/e2e/features/steps/health.py +++ b/tests/e2e/features/steps/health.py @@ -5,6 +5,8 @@ from behave import given # pyright: ignore[reportAttributeAccessIssue] from behave.runner import Context +from tests.e2e.utils.utils import is_prow_environment + @given("The llama-stack connection is disrupted") def llama_stack_connection_broken(context: Context) -> None: @@ -27,6 +29,13 @@ def llama_stack_connection_broken(context: Context) -> None: # Store original state for restoration context.llama_stack_was_running = False + if is_prow_environment(): + from tests.e2e.utils.prow_utils import disrupt_llama_stack_pod + + context.llama_stack_was_running = disrupt_llama_stack_pod() + return + + # Docker-based disruption try: result = subprocess.run( ["docker", "inspect", "-f", "{{.State.Running}}", "llama-stack"], diff --git a/tests/e2e/features/steps/llm_query_response.py b/tests/e2e/features/steps/llm_query_response.py index 732f6e291..6e39b2a7c 100644 --- a/tests/e2e/features/steps/llm_query_response.py +++ b/tests/e2e/features/steps/llm_query_response.py @@ -1,12 +1,14 @@ """LLM query and response steps.""" import json +import os import requests from behave import then, step # pyright: ignore[reportAttributeAccessIssue] from behave.runner import Context from tests.e2e.utils.utils import replace_placeholders -DEFAULT_LLM_TIMEOUT = 60 +# Longer timeout for Prow/OpenShift with CPU-based vLLM +DEFAULT_LLM_TIMEOUT = 180 if os.getenv("RUNNING_PROW") 
else 60 @step("I wait for the response to be completed") diff --git a/tests/e2e/features/steps/rbac.py b/tests/e2e/features/steps/rbac.py index babaa0be5..4a8cb3e6d 100644 --- a/tests/e2e/features/steps/rbac.py +++ b/tests/e2e/features/steps/rbac.py @@ -7,7 +7,10 @@ def get_test_tokens() -> dict[str, str]: - """Fetch test tokens from the mock JWKS server.""" + """Fetch test tokens from the mock JWKS server. + + In Prow environment, mock-jwks is port-forwarded to localhost:8000. + """ jwks_host = os.getenv("E2E_JWKS_HOSTNAME", "localhost") jwks_port = os.getenv("E2E_JWKS_PORT", "8000") tokens_url = f"http://{jwks_host}:{jwks_port}/tokens" diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py new file mode 100644 index 000000000..6f138fe15 --- /dev/null +++ b/tests/e2e/utils/prow_utils.py @@ -0,0 +1,215 @@ +"""Prow/OpenShift-specific utility functions for E2E tests. + +This module contains all functions that interact with OpenShift via the `oc` CLI +and are only used when running tests in the Prow CI environment. 
+""" + +import os +import subprocess +import tempfile + + +def get_namespace() -> str: + """Get the Kubernetes namespace for Prow environment.""" + return os.getenv("NAMESPACE", "e2e-rhoai-dsc") + + +# Mapping from container names (used in tests) to pod names (used in OpenShift) +_POD_NAME_MAP = { + "lightspeed-stack": "lightspeed-stack-service", + "llama-stack": "llama-stack-service", +} + + +def get_pod_name(container_name: str) -> str: + """Map container name to OpenShift pod name.""" + return _POD_NAME_MAP.get(container_name, container_name) + + +def _get_e2e_ops_script() -> str: + """Get the path to the consolidated e2e-ops.sh script.""" + tests_dir = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + ) + return os.path.join(tests_dir, "e2e-prow/rhoai/scripts/e2e-ops.sh") + + +def run_e2e_ops( + command: str, args: list[str] | None = None, timeout: int = 180 +) -> subprocess.CompletedProcess: + """Run a command via the consolidated e2e-ops.sh script. + + Args: + command: The command to run (e.g., "restart-lightspeed", "wait-for-pod"). + args: Optional list of arguments to pass to the command. + timeout: Timeout in seconds. + + Returns: + CompletedProcess object with stdout/stderr. 
+    """
+    script_path = _get_e2e_ops_script()
+    cmd = ["bash", script_path, command] + (args or [])
+    return subprocess.run(
+        cmd,
+        env={**os.environ, "NAMESPACE": get_namespace()},
+        capture_output=True,
+        text=True,
+        timeout=timeout,
+    )
+
+
+def wait_for_pod_health(pod_name: str, max_attempts: int = 12) -> None:
+    """Wait for pod to be ready in OpenShift/Prow environment."""
+    actual_pod_name = get_pod_name(pod_name)
+    try:
+        result = run_e2e_ops("wait-for-pod", [actual_pod_name, str(max_attempts)])
+        print(result.stdout, end="")
+        if result.returncode != 0:
+            print(result.stderr, end="")
+            raise subprocess.CalledProcessError(result.returncode, "wait-for-pod")
+    except subprocess.TimeoutExpired:
+        print(f"Timeout waiting for pod {actual_pod_name}")
+        raise
+
+
+def restart_pod(container_name: str) -> None:
+    """Restart lightspeed-stack pod in OpenShift/Prow environment."""
+    try:
+        result = run_e2e_ops("restart-lightspeed", timeout=120)
+        print(result.stdout, end="")
+        if result.returncode != 0:
+            print(result.stderr, end="")
+            raise subprocess.CalledProcessError(result.returncode, "restart-lightspeed")
+    except subprocess.TimeoutExpired as e:
+        print(f"Failed to restart pod {container_name}: {e}")
+        raise
+
+
+def restore_llama_stack_pod() -> None:
+    """Restore Llama Stack pod in Prow/OpenShift environment."""
+    try:
+        result = run_e2e_ops("restart-llama-stack", timeout=180)
+        print(result.stdout, end="")
+        if result.returncode != 0:
+            print(result.stderr, end="")
+        else:
+            print("✓ Llama Stack pod restored successfully")
+    except subprocess.TimeoutExpired:
+        print("Warning: Timeout while restoring Llama Stack pod")
+
+
+def disrupt_llama_stack_pod() -> bool:
+    """Disrupt llama-stack connection in Prow/OpenShift environment.
+
+    Returns:
+        True if the pod was running and has been disrupted, False otherwise.
+ """ + try: + result = run_e2e_ops("disrupt-llama-stack", timeout=90) + print(result.stdout, end="") + + # Exit code 0 = disrupted (was running), exit code 2 = was not running + if result.returncode == 0: + return True + elif result.returncode == 2: + return False + else: + print(result.stderr, end="") + return False + + except subprocess.TimeoutExpired: + print("Warning: Timeout while disrupting Llama Stack connection") + return False + + +# In-memory storage for ConfigMap backups in Prow environment +_configmap_backups: dict[str, str] = {} + + +def backup_configmap_to_memory() -> str: + """Backup the current ConfigMap content to memory.""" + namespace = get_namespace() + configmap_name = "lightspeed-stack-config" + backup_key = f"{namespace}/{configmap_name}" + + if backup_key in _configmap_backups: + print(f"ConfigMap backup already exists for {backup_key}") + return backup_key + + print(f"Backing up ConfigMap {configmap_name} to memory...") + + try: + result = run_e2e_ops("get-configmap-content", [configmap_name], timeout=30) + if result.returncode != 0: + raise subprocess.CalledProcessError( + result.returncode, "get-configmap-content", result.stderr + ) + + _configmap_backups[backup_key] = result.stdout + print(f"ConfigMap backed up to memory ({len(result.stdout)} bytes)") + return backup_key + + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + print(f"Failed to backup ConfigMap: {e}") + raise + + +def remove_configmap_backup(backup_key: str) -> None: + """Remove a ConfigMap backup from memory.""" + if backup_key in _configmap_backups: + del _configmap_backups[backup_key] + print(f"ConfigMap backup {backup_key} removed from memory") + + +def _recreate_configmap(configmap_name: str, source_file: str) -> None: + """Delete and recreate a ConfigMap from a file. + + Args: + configmap_name: Name of the ConfigMap. + source_file: Path to the file to create the ConfigMap from. 
+ """ + result = run_e2e_ops("update-configmap", [configmap_name, source_file], timeout=60) + if result.returncode != 0: + raise subprocess.CalledProcessError( + result.returncode, "update-configmap", result.stderr + ) + + +def update_config_configmap(source: str) -> None: + """Update the lightspeed-stack-config ConfigMap with new config in Prow environment. + + Args: + source: Either a file path or a backup key from _configmap_backups. + """ + configmap_name = "lightspeed-stack-config" + + # Check if source is a backup key (restore from memory) + if source in _configmap_backups: + config_content = _configmap_backups[source] + print(f"Restoring ConfigMap {configmap_name} from memory backup...") + + # Write content to temp file (oc create configmap requires a file) + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(config_content) + temp_path = f.name + + try: + _recreate_configmap(configmap_name, temp_path) + print(f"✓ ConfigMap {configmap_name} restored successfully") + except subprocess.CalledProcessError as e: + print(f"Failed to restore ConfigMap: {e}") + raise + finally: + if os.path.exists(temp_path): + os.remove(temp_path) + return + + # Otherwise, source is a file path + print(f"Updating ConfigMap {configmap_name} with config from {source}...") + + try: + _recreate_configmap(configmap_name, source) + print(f"ConfigMap {configmap_name} updated successfully") + except subprocess.CalledProcessError as e: + print(f"Failed to update ConfigMap: {e}") + raise diff --git a/tests/e2e/utils/utils.py b/tests/e2e/utils/utils.py index 9c11dd9a6..6b73a0e1f 100644 --- a/tests/e2e/utils/utils.py +++ b/tests/e2e/utils/utils.py @@ -9,6 +9,19 @@ import jsonschema from behave.runner import Context +from tests.e2e.utils.prow_utils import ( + backup_configmap_to_memory, + remove_configmap_backup, + restart_pod, + update_config_configmap, + wait_for_pod_health, +) + + +def is_prow_environment() -> bool: + """Check if running in 
Prow/OpenShift environment.""" + return os.getenv("RUNNING_PROW") is not None + def normalize_endpoint(endpoint: str) -> str: """Normalize endpoint to be added into the URL. @@ -78,6 +91,10 @@ def wait_for_container_health(container_name: str, max_attempts: int = 3) -> Non container_name (str): Docker container name or ID to check. max_attempts (int): Maximum number of health check attempts (default 3). """ + if is_prow_environment(): + wait_for_pod_health(container_name, max_attempts) + return + for attempt in range(max_attempts): try: result = subprocess.run( @@ -167,6 +184,10 @@ def switch_config( written due to permissions. OSError: For other OS-related failures during the copy operation. """ + if is_prow_environment(): + update_config_configmap(source_path) + return + try: shutil.copy(source_path, destination_path) except (FileNotFoundError, PermissionError, OSError) as e: @@ -188,6 +209,9 @@ def create_config_backup(config_path: str) -> str: PermissionError: If the process lacks permission to read or write the files. OSError: For other OS-level errors encountered while copying. """ + if is_prow_environment(): + return backup_configmap_to_memory() + backup_file = f"{config_path}.backup" if not os.path.exists(backup_file): try: @@ -211,6 +235,10 @@ def remove_config_backup(backup_path: str) -> None: Parameters: backup_path (str): Filesystem path to the backup file to remove. """ + if is_prow_environment(): + remove_configmap_backup(backup_path) + return + if os.path.exists(backup_path): try: os.remove(backup_path) @@ -228,6 +256,10 @@ def restart_container(container_name: str) -> None: subprocess.CalledProcessError: if the `docker restart` command fails. subprocess.TimeoutExpired: if the `docker restart` command times out. """ + if is_prow_environment(): + restart_pod(container_name) + return + try: subprocess.run( ["docker", "restart", container_name],