InternRobotics · kew6688 · May 18, 2026
diff --git a/baselines/InternVLA-A1/README.md b/baselines/InternVLA-A1/README.md
@@ -27,6 +27,13 @@ Then update the checkpoint path used by `eval_pjsim.sh` (or pass it via script a
 
 ## 3) Run evaluation
 
+First, install the `genmanip-client` SDK into the InternVLA-A1 environment:
+```bash
+# install the EBench evaluation client
+git clone https://github.com/InternRobotics/genmanip-client.git
+cd genmanip-client && pip install -e ".[full_numpy2]" && cd -
+```
+
 From this directory, run:
 
 ```bash
@@ -35,7 +42,60 @@ bash eval_pjsim.sh
 
 The script will launch `inference.py` and start evaluation.
 
+## 4) Quick start: single worker
+
+```bash
+python inference.py \
+  --ckpt_path /path/to/checkpoints/EBench-Generalist-InternVLA-A1 \
+  --url "$BASE_URL" \
+  --run_id "$RUN_ID" \
+  --token "$TOKEN" \
+  --worker_ids 0
+```
+
+`--url` should point to the remote EBench evaluation server. If `--url`
+is empty, `inference.py` falls back to `http://{host}:{port}`.
+
+## 5) Multi-worker / multi-host: `run_internvla_eval.sh`
+
+`scripts/run_internvla_eval.sh` launches one `inference.py` process per
+worker id, writes each worker's stdout/stderr to a separate log file, and
+forwards SIGINT/SIGTERM to all children for clean shutdown.
+
+```bash
+CKPT_PATH=/your/checkpoints/EBench-Generalist-InternVLA-A1 \
+BASE_URL=https://your-ebench-server.example.com \
+RUN_ID=my_run_2026_04_29 \
+TOKEN=$EBENCH_TOKEN \
+WORKER_IDS=0,1,2,3 \
+GPU_IDS=0,1,2,3 \
+LOG_DIR=log_dir/internvla-a1-generalist \
+bash /path/to/EBench/scripts/run_internvla_eval.sh
+```
+
+## 6) CLI reference (`inference.py`)
+
+| Flag | Default | Description |
+| ---- | ------- | ----------- |
+| `--ckpt_path` | _req._ | Local InternVLA-A1 checkpoint directory. Must contain the policy config, weights, and `stats.json`. |
+| `--stats_key` | `lift2` | Key inside `stats.json` used for state/action normalization statistics. |
+| `--resize_size` | `224` | Image resize target used by `ResizeImagesWithPadFn`. |
+| `--image_history_interval` | `15` | Interval used to pick the historical frame paired with the current frame. |
+| `--action_mode` | `delta` | `delta`: add predicted joint deltas to the current joint state. `abs`: use predicted actions directly. |
+| `--dtype` | `float32` | Policy inference dtype. Supported by the script: `float32` or `bfloat16`. |
+| `--decode_image` | `False` | Passed through to `policy.predict_action_chunk(..., decode_image=...)`. Usually leave disabled for evaluation. |
+| `--infer_horizon` | `50` | Number of predicted action steps kept from the model output before final truncation. |
+| `--action_horizon_size` | `30` | Number of action steps deployed from each planned chunk. |
+| `--worker_ids` | `0` | Comma-separated worker ids handled by this process, e.g. `0,1,2`. For multi-process launch, use one id per process. |
+| `--url` | `""` | Full EBench evaluation server URL. Overrides `--host` and `--port` when set. |
+| `--host` | `0.0.0.0` | Host used to build `http://{host}:{port}` when `--url` is not set. |
+| `--port` | `8087` | Port used to build `http://{host}:{port}` when `--url` is not set. |
+| `--run_id` | `""` | Identifier of the evaluation run; shared across all workers and hosts for the same run. |
+| `--token` | `""` | Auth token issued by the EBench server. |
+
+
 ## Notes
 
 - Ensure your environment variables in `eval_pjsim.sh` are set correctly (for example `HF_HOME`).
+- The first run may require downloading checkpoints and tokenizer files from Hugging Face.
 - If you use offline mode, set `HF_HUB_OFFLINE=1` and `TRANSFORMERS_OFFLINE=1` only after all required files have been fully downloaded.
diff --git a/baselines/InternVLA-A1/eval_pjsim.sh b/baselines/InternVLA-A1/eval_pjsim.sh
@@ -1,7 +1,7 @@
 export LD_LIBRARY_PATH="/opt/libjpeg-turbo/lib64:/opt/libjpeg-turbo/lib:${LD_LIBRARY_PATH-}"
-export GENMANIP_RESULT_DIR=./evaluation/genmanip
-export HF_HOME=/your/hf_home
-export HF_HUB_OFFLINE=1
-export TRANSFORMERS_OFFLINE=1
+export GENMANIP_RESULT_DIR=./client_results
+export HF_HOME=./hf_home
+export HF_HUB_OFFLINE=0
+export TRANSFORMERS_OFFLINE=0
 
-python inference.py --ckpt_path ./checkpoints/EBench-Generalist-InternVLA-A1 --worker_ids 0
+python inference.py --ckpt_path your/path/to/checkpoints/EBench-Generalist-InternVLA-A1 --worker_ids 0
diff --git a/baselines/InternVLA-A1/inference.py b/baselines/InternVLA-A1/inference.py
@@ -78,6 +78,9 @@ class InferenceArgs:
     worker_ids: str = "0"  # Comma-separated worker IDs, e.g. "0,1,2"
     host: str = "0.0.0.0"
     port: int = 8087
+    url: str = ""  # Optional custom URL for EvalClient connection, overrides host and port if provided.
+    token: str = ""
+    run_id: str = ""
 
 
 class QwenA1PolicyWrapper:
@@ -124,9 +127,9 @@ def get_action(self, obs: dict) -> dict:
             predicted_rel_base = action[16:]
             target_base_abs = self.chunk_start_base + predicted_rel_base
             base_motion = target_base_abs - curr_base
-            output["action"] = joints_gripper
+            output["action"] = joints_gripper.tolist()
             output['is_rel'] = False
-            output["base_motion"] = base_motion
+            output["base_motion"] = base_motion.tolist()
             output['base_is_rel'] = True
             output["control_type"] = "joint_position"
             return output
@@ -143,9 +146,9 @@ def get_action_chunk(self, obs: dict) -> list[dict]:
             joints_gripper = self._pack_action_fields(action)
             predicted_rel_base = action[16:]
             target_base_abs = curr_base + predicted_rel_base
-            action_step["action"] = joints_gripper
+            action_step["action"] = joints_gripper.tolist()
             action_step['is_rel'] = False
-            action_step["base_motion"] = target_base_abs
+            action_step["base_motion"] = target_base_abs.tolist()
             action_step['base_is_rel'] = False
             action_step["control_type"] = "joint_position"
             action_chunk.append(action_step)
@@ -200,7 +203,14 @@ def _plan_action(self, obs: dict) -> None:
                     image = sample[key].permute(0, 3, 1, 2)
                     sample[key] = image
 
-            sample = self.input_transforms(sample)
+            for transform in self.input_transforms.transforms:
+                if isinstance(transform, Qwen3_VLProcessorTransformFn):
+                    sample.update({
+                        f"{OBS_IMAGES}.image0_mask": torch.tensor([True]).cuda(),
+                        f"{OBS_IMAGES}.image1_mask": torch.tensor([True]).cuda(),
+                        f"{OBS_IMAGES}.image2_mask": torch.tensor([True]).cuda(),
+                    })
+                sample = transform(sample)
 
             inputs = {}
             for key in sample.keys():
@@ -247,9 +257,15 @@ def _plan_action(self, obs: dict) -> None:
     if not worker_ids:
         raise ValueError("`worker_ids` must contain at least one valid worker id.")
 
-    base_url = f"http://{args.host}:{args.port}"
+    if args.url:
+        base_url = args.url
+    else:
+        base_url = f"http://{args.host}:{args.port}"
+
     client = EvalClient(
-        base_url,
+        base_url=base_url,
+        token=args.token,
+        run_id=args.run_id,
         worker_ids=worker_ids
     )
     policy_list = [QwenA1PolicyWrapper(args) for _ in worker_ids]
@@ -263,7 +279,20 @@ def _plan_action(self, obs: dict) -> None:
                 worker_id: policy.get_action(obs[worker_id])
                 for worker_id, policy in zip(worker_ids, policy_list)
             }
-            obs, done = client.step(action)
+            try:
+                obs, done = client.step(action)
+            except Exception as exc:
+                print(f"[warn] EvalClient step failed: {exc}", flush=True)
+                client.close()
+                client = EvalClient(
+                    base_url=base_url,
+                    token=args.token,
+                    run_id=args.run_id,
+                    worker_ids=worker_ids
+                )
+                obs = client.reset()
+                policy.reset()
+                break
             if done:
                 break
 

diff --git a/baselines/X-VLA/README.md b/baselines/X-VLA/README.md
@@ -39,7 +39,7 @@ pip install -r EBench/baselines/X-VLA/requirements.txt
 
 # install the EBench evaluation client
 git clone https://github.com/InternRobotics/genmanip-client.git
-cd genmanip-client && pip install -e . && cd -
+cd genmanip-client && pip install -e ".[full_numpy1]" && cd -
 ```
 
 > `transformers<=4.51.3` is pinned because X-VLA's custom modeling code

diff --git a/baselines/openpi/README.md b/baselines/openpi/README.md
@@ -6,13 +6,19 @@ This guide describes the minimal workflow for evaluating the post-trained OpenPI
 
 ### 1. Install OpenPI
 
-Install the official OpenPI repository located at:
+Install the official OpenPI repository located at: `baselines/openpi/third_party/openpi`. 
+Please refer to the official OpenPI README for detailed setup instructions.
 
-```bash
-baselines/openpi/third_party/openpi
+For example:
 ```
+cd baselines/openpi/third_party/openpi
+GIT_LFS_SKIP_SMUDGE=1 uv sync
+GIT_LFS_SKIP_SMUDGE=1 uv pip install -e .
 
-Please refer to the official OpenPI README for detailed setup instructions.
+# install genmanip client for ebench evaluation
+cd path/to/genmanip-client
+uv pip install -e ".[full_numpy1]"
+```
 
 ### 2. Add EBench-Specific Files
 
@@ -25,17 +31,8 @@ baselines/openpi/scripts/
 
 Layer these files on top of the official OpenPI codebase. This can be done by either updating `PYTHONPATH` or copying the files to the corresponding locations in the OpenPI repository.
 
-### 3. Configure Evaluation Settings
-
-Before running evaluation, modify the configuration file:
-
-```bash
-scripts/launch_pi_onlineeval.sh
-```
 
-Please make sure that the model path, dataset path, environment settings, and output directory are correctly specified.
-
-### 4. Download EBench Post-Trained Models
+### 3. Download EBench Post-Trained Models
 
 The post-trained OpenPI models on EBench are available at:
 
@@ -54,4 +51,6 @@ Launch the evaluation with:
 
 ```bash
 bash scripts/launch_pi_onlineeval.sh
-```
+```
+
+Before launching, make sure that the model path, dataset path, environment settings, and output directory are correctly specified in `scripts/launch_pi_onlineeval.sh`.
diff --git a/baselines/openpi/scripts/pi_eval_client_online.py b/baselines/openpi/scripts/pi_eval_client_online.py
@@ -110,13 +110,13 @@ def parse_args():
         obs = eval_client.reset()
         eval_finished = False
         while not eval_finished:
-
-            if obs[ids]["obs"]["reset"]:
-                pass
             action_chunk = pi0_client.get_action(obs)
-            obs, eval_finished = eval_client.step(action_chunk)
+            try:
+                obs, eval_finished = eval_client.step(action_chunk)
+            except Exception as e:
+                eval_client.close()
+                eval_client = EvalClient(base_url=base_url, worker_ids=worker_ids, run_id=run_id, token=token)
+                obs = eval_client.reset()
 
     finally:
         eval_client.close()
-
-
diff --git a/scripts/run_xvla_eval.sh b/scripts/run_xvla_eval.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 # Usage: MODEL_PATH=... BASE_URL=... RUN_ID=... TOKEN=... \
-#        WORKER_IDS=0,1,2,3 ./EBench/scripts/run_xvla_eval.sh
+#        WORKER_IDS=0,1,2,3 GPU_IDS=0,1,2,3 ./EBench/scripts/run_xvla_eval.sh
 # Multi-host: share RUN_ID across hosts, give each host a disjoint WORKER_IDS slice.
 
 set -uo pipefail
@@ -11,6 +11,9 @@ STEP_MODE="${STEP_MODE:-step}"
 mkdir -p "$LOG_DIR"
 echo "[run_xvla_eval] logs -> $LOG_DIR"
 echo "[run_xvla_eval] step_mode=$STEP_MODE"
+if [[ -n "${GPU_IDS:-}" ]]; then
+    echo "[run_xvla_eval] gpu_ids=$GPU_IDS"
+fi
 
 pids=()
 
@@ -45,11 +48,20 @@ cleanup() {
 trap 'cleanup 130' INT TERM
 trap 'cleanup "$?"' EXIT
 
+worker_ids=(${WORKER_IDS//,/ })
+gpu_ids=(${GPU_IDS:-})
+if [[ -n "${GPU_IDS:-}" ]]; then
+    gpu_ids=(${GPU_IDS//,/ })
+fi
 
-
-for wid in ${WORKER_IDS//,/ }; do
+for i in "${!worker_ids[@]}"; do
+    wid="${worker_ids[$i]}"
     log="$LOG_DIR/worker_${wid}.log"
-    python -u "$RUN_PY" \
+    env_args=()
+    if (( ${#gpu_ids[@]} )); then
+        env_args=("CUDA_VISIBLE_DEVICES=${gpu_ids[$((i % ${#gpu_ids[@]}))]}")
+    fi
+    env "${env_args[@]}" python -u "$RUN_PY" \
         --model_path "$MODEL_PATH" \
         --base_url  "$BASE_URL" \
         --run_id    "$RUN_ID" \
@@ -58,7 +70,11 @@ for wid in ${WORKER_IDS//,/ }; do
         --step_mode "$STEP_MODE" \
         > "$log" 2>&1 &
     pids+=($!)
-    echo "[run_xvla_eval] worker=$wid pid=$! log=$log"
+    if (( ${#gpu_ids[@]} )); then
+        echo "[run_xvla_eval] worker=$wid gpu=${gpu_ids[$((i % ${#gpu_ids[@]}))]} pid=$! log=$log"
+    else
+        echo "[run_xvla_eval] worker=$wid pid=$! log=$log"
+    fi
 done
 
 rc=0