Merged
490 changes: 456 additions & 34 deletions benchmark/profile_restful_api.py

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions lmdeploy/pytorch/engine/engine_instance.py
@@ -97,13 +97,12 @@ def _get_extra_outputs(self, resp: Response):
routed_experts = resp.data.get('routed_experts', None) if resp.data else None
if routed_experts is not None and resp.type in [ResponseType.FINISH, ResponseType.CANCEL]:
if self._enable_transfer_obj_ref:
import base64

import pybase64
import ray

ref = ray.put(routed_experts)
data = ray.cloudpickle.dumps(ref)
outputs['routed_experts'] = base64.b64encode(data).decode('utf-8')
outputs['routed_experts'] = pybase64.b64encode(data).decode('utf-8')
else:
outputs['routed_experts'] = routed_experts
return outputs
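
# The decode side of this handoff is not shown in the hunk. A minimal sketch,
# assuming the consumer runs in the same Ray cluster: `ray.cloudpickle.loads`
# and `ray.get` are the standard counterparts of the calls above, and
# `pybase64` is a drop-in, SIMD-accelerated replacement for stdlib `base64`.
import pybase64
import ray

def recover_routed_experts(encoded: str):
    # str -> bytes -> ObjectRef -> the routed_experts value
    ref = ray.cloudpickle.loads(pybase64.b64decode(encoded))
    return ray.get(ref)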
4 changes: 2 additions & 2 deletions lmdeploy/pytorch/engine/model_agent.py
@@ -1,6 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
import asyncio
import base64
import functools
import time
from contextlib import contextmanager
@@ -10,6 +9,7 @@
from typing import Any, Dict, List, Optional

import numpy as np
import pybase64
import torch
import torch.distributed as dist
from torch.profiler import ProfilerActivity, profile, record_function
@@ -1128,7 +1128,7 @@ def _construct(item):
if isinstance(serialized_data, list):
serialized_data = serialized_data[self.dist_ctx.tp_group.rank]
model = self.patched_model.get_model()
weights = ForkingPickler.loads(base64.b64decode(serialized_data))
weights = ForkingPickler.loads(pybase64.b64decode(serialized_data))
if request.load_format == 'flattened_bucket':
metadata: List[FlattenedTensorMetadata] = weights['metadata']
if metadata:
194 changes: 99 additions & 95 deletions lmdeploy/pytorch/models/qwen3_vl.py
@@ -1,7 +1,9 @@
# Copyright (c) OpenMMLab. All rights reserved.

from functools import lru_cache
from typing import Any, Dict, Iterable, List, Optional, Tuple

import numpy as np
import torch
from torch import nn
from transformers.configuration_utils import PretrainedConfig
@@ -326,102 +328,104 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
for _ in range(len(config.deepstack_visual_indexes))
])

@staticmethod
@lru_cache(maxsize=1024)
def rot_pos_ids(h: int, w: int, spatial_merge_size: int) -> torch.Tensor:
h_div = h // spatial_merge_size
w_div = w // spatial_merge_size

hpos_ids = np.broadcast_to(np.arange(h).reshape(h, 1), (h, w))
hpos_ids = hpos_ids.reshape(
h_div,
spatial_merge_size,
w_div,
spatial_merge_size,
)
hpos_ids = hpos_ids.transpose(0, 2, 1, 3)
hpos_ids = hpos_ids.flatten()

wpos_ids = np.broadcast_to(np.arange(w).reshape(1, w), (h, w))
wpos_ids = wpos_ids.reshape(
h_div,
spatial_merge_size,
w_div,
spatial_merge_size,
)
wpos_ids = wpos_ids.transpose(0, 2, 1, 3)
wpos_ids = wpos_ids.flatten()

return torch.from_numpy(np.stack([hpos_ids, wpos_ids], axis=-1))
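
# Because `rot_pos_ids` is a pure function of (h, w, spatial_merge_size),
# `lru_cache` returns the *same* tensor object for repeated shapes, so the
# result must be treated as read-only. A hedged usage sketch; the class name
# `Qwen3VLVisionModel` is assumed for this excerpt:
ids_a = Qwen3VLVisionModel.rot_pos_ids(32, 32, 2)
ids_b = Qwen3VLVisionModel.rot_pos_ids(32, 32, 2)
assert ids_a is ids_b  # cache hit: shared object, not a copy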

def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
merge_size = self.spatial_merge_size

max_hw = int(grid_thw[:, 1:].max().item())
freq_table = self.rotary_pos_emb(max_hw) # (max_hw, dim // 2)
device = freq_table.device

total_tokens = int(torch.prod(grid_thw, dim=1).sum().item())
pos_ids = torch.empty((total_tokens, 2), dtype=torch.long, device=device)

offset = 0
for num_frames, height, width in grid_thw:
merged_h, merged_w = height // merge_size, width // merge_size

block_rows = torch.arange(merged_h, device=device) # block row indices
block_cols = torch.arange(merged_w, device=device) # block col indices
intra_row = torch.arange(merge_size, device=device) # intra-block row offsets
intra_col = torch.arange(merge_size, device=device) # intra-block col offsets

# Compute full-resolution positions
row_idx = block_rows[:, None, None, None] * merge_size + intra_row[None, None, :, None]
col_idx = block_cols[None, :, None, None] * merge_size + intra_col[None, None, None, :]

row_idx = row_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
col_idx = col_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)

coords = torch.stack((row_idx, col_idx), dim=-1)

if num_frames > 1:
coords = coords.repeat(num_frames, 1)

num_tokens = coords.shape[0]
pos_ids[offset:offset + num_tokens] = coords
offset += num_tokens

embeddings = freq_table[pos_ids] # lookup rotary embeddings
embeddings = embeddings.flatten(1)
return embeddings

def fast_pos_embed_interpolate(self, grid_thw):
grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2]

idx_list = [[] for _ in range(4)]
weight_list = [[] for _ in range(4)]

for t, h, w in zip(grid_ts, grid_hs, grid_ws):
h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h)
w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w)

h_idxs_floor = h_idxs.int()
w_idxs_floor = w_idxs.int()
h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)
w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)

dh = h_idxs - h_idxs_floor
dw = w_idxs - w_idxs_floor

base_h = h_idxs_floor * self.num_grid_per_side
base_h_ceil = h_idxs_ceil * self.num_grid_per_side

indices = [
(base_h[None].T + w_idxs_floor[None]).flatten(),
(base_h[None].T + w_idxs_ceil[None]).flatten(),
(base_h_ceil[None].T + w_idxs_floor[None]).flatten(),
(base_h_ceil[None].T + w_idxs_ceil[None]).flatten(),
]

weights = [
((1 - dh)[None].T * (1 - dw)[None]).flatten(),
((1 - dh)[None].T * dw[None]).flatten(),
(dh[None].T * (1 - dw)[None]).flatten(),
(dh[None].T * dw[None]).flatten(),
]

for i in range(4):
idx_list[i].extend(indices[i].tolist())
weight_list[i].extend(weights[i].tolist())

idx_tensor = torch.tensor(idx_list, dtype=torch.long, device=self.pos_embed.weight.device)
weight_tensor = torch.tensor(weight_list,
dtype=self.pos_embed.weight.dtype,
device=self.pos_embed.weight.device)
pos_embeds = self.pos_embed(idx_tensor) * weight_tensor[:, :, None]
patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3]

patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)])

patch_pos_embeds_permute = []
merge_size = self.config.spatial_merge_size
for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws):
pos_embed = pos_embed.repeat(t, 1)
pos_embed = (pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size,
-1).permute(0, 1, 3, 2, 4, 5).flatten(0, 4))
patch_pos_embeds_permute.append(pos_embed)
patch_pos_embeds = torch.cat(patch_pos_embeds_permute)
return patch_pos_embeds
"""Rotary position embedding."""
pos_ids = []

for t, h, w in grid_thw:
base = self.rot_pos_ids(int(h), int(w), self.spatial_merge_size)
pos_ids.append(base if t == 1 else base.repeat(t, 1))

pos_ids = torch.cat(pos_ids, dim=0)
max_grid_size = grid_thw[:, 1:].max()
rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)

return rotary_pos_emb

# copied from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen3_vl.py#L474
def fast_pos_embed_interpolate(self, grid_thw: List[List[int]]) -> torch.Tensor:
num_grid_per_side = self.num_grid_per_side
m_size = self.spatial_merge_size
hidden_dim = self.pos_embed.embedding_dim
device = self.pos_embed.weight.device

outputs = []
for t, h, w in grid_thw:
h_idxs = torch.linspace(0, num_grid_per_side - 1, h, dtype=torch.float32, device=device)
w_idxs = torch.linspace(0, num_grid_per_side - 1, w, dtype=torch.float32, device=device)

h_floor = h_idxs.to(torch.long)
w_floor = w_idxs.to(torch.long)
h_ceil = torch.clamp(h_floor + 1, max=num_grid_per_side - 1)
w_ceil = torch.clamp(w_floor + 1, max=num_grid_per_side - 1)

dh = h_idxs - h_floor
dw = w_idxs - w_floor

# Create meshgrid view for all h, w vars
dh_grid, dw_grid = torch.meshgrid(dh, dw, indexing='ij')
h_floor_grid, w_floor_grid = torch.meshgrid(h_floor, w_floor, indexing='ij')
h_ceil_grid, w_ceil_grid = torch.meshgrid(h_ceil, w_ceil, indexing='ij')

# original computation of weights
# w00 = (1 - dh_grid) * (1 - dw_grid)
# w01 = (1 - dh_grid) * dw_grid
# w10 = dh_grid * (1 - dw_grid)
# w11 = dh_grid * dw_grid
# w11 is computed once and reused below so the
# dh_grid * dw_grid product is not duplicated
w11 = dh_grid * dw_grid
w10 = dh_grid - w11
w01 = dw_grid - w11
w00 = 1 - dh_grid - w01
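# Sanity check for the reuse: expanding the products gives
#   w10 = dh_grid * (1 - dw_grid) = dh_grid - w11
#   w01 = (1 - dh_grid) * dw_grid = dw_grid - w11
#   w00 = (1 - dh_grid) * (1 - dw_grid) = 1 - dh_grid - dw_grid + w11
#       = 1 - dh_grid - w01
# so all four bilinear weights still sum to 1 at every grid location.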

h_grid = torch.stack([h_floor_grid, h_floor_grid, h_ceil_grid, h_ceil_grid])
w_grid = torch.stack([w_floor_grid, w_ceil_grid, w_floor_grid, w_ceil_grid])
h_grid_idx = h_grid * num_grid_per_side

indices = (h_grid_idx + w_grid).reshape(4, -1)
weights = torch.stack([w00, w01, w10, w11], dim=0).reshape(4, -1, 1)
weights = weights.to(dtype=self.pos_embed.weight.dtype, device=device)

embeds = self.pos_embed(indices)
embeds *= weights
combined = embeds.sum(dim=0)

combined = combined.reshape(h // m_size, m_size, w // m_size, m_size, hidden_dim)
combined = combined.permute(0, 2, 1, 3, 4).reshape(1, -1, hidden_dim)
repeated = combined.expand(t, -1, -1).reshape(-1, hidden_dim)
outputs.append(repeated)

return torch.cat(outputs, dim=0)
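
# A standalone sanity check of the bilinear weights used above, assuming
# nothing beyond PyTorch: the four weights sum to 1 everywhere and match
# the textbook (1 - dh) * (1 - dw) form.
import torch
dh, dw = torch.rand(4), torch.rand(5)
dh_g, dw_g = torch.meshgrid(dh, dw, indexing='ij')
w11 = dh_g * dw_g
w10, w01 = dh_g - w11, dw_g - w11
w00 = 1 - dh_g - w01
assert torch.allclose(w00 + w01 + w10 + w11, torch.ones_like(w00))
assert torch.allclose(w00, (1 - dh_g) * (1 - dw_g))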

def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor,
pos_embeds: torch.Tensor) -> torch.Tensor:
4 changes: 2 additions & 2 deletions lmdeploy/turbomind/turbomind.py
@@ -1,7 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.

import asyncio
import base64
import copy
import json
import math
@@ -16,6 +15,7 @@
from typing import Any, Dict, List, Optional

import numpy as np
import pybase64
import torch
import yaml
from torch.nn.utils.rnn import pad_sequence
@@ -328,7 +328,7 @@ def _construct(item):

with torch.cuda.device(self.devices[0]):
if isinstance(request.serialized_named_tensors, str):
weights = ForkingPickler.loads(base64.b64decode(request.serialized_named_tensors))
weights = ForkingPickler.loads(pybase64.b64decode(request.serialized_named_tensors))
weights = {k: _construct(v) for k, v in weights}
else:
weights = request.serialized_named_tensors
4 changes: 2 additions & 2 deletions lmdeploy/utils.py
@@ -455,10 +455,10 @@ def serialize_state_dict(state_dict: dict) -> str:
Returns:
str: serialized state dict.
"""
import base64
from io import BytesIO
from multiprocessing.reduction import ForkingPickler

import pybase64
from torch.multiprocessing.reductions import reduce_tensor

# flattened_tensor
@@ -472,7 +472,7 @@ def serialize_state_dict(state_dict: dict) -> str:
buf = BytesIO()
ForkingPickler(buf).dump(data)
buf.seek(0)
return base64.b64encode(buf.read()).decode('utf-8')
return pybase64.b64encode(buf.read()).decode('utf-8')
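
# The matching decode path appears in model_agent.py and turbomind.py above
# (`ForkingPickler.loads(pybase64.b64decode(...))`). A minimal round-trip
# sketch, assuming both ends can resolve the reduced (shared-memory) tensor
# handles, e.g. on the same host:
import pybase64
from io import BytesIO
from multiprocessing.reduction import ForkingPickler

def _roundtrip(obj):
    buf = BytesIO()
    ForkingPickler(buf).dump(obj)
    encoded = pybase64.b64encode(buf.getvalue()).decode('utf-8')
    return ForkingPickler.loads(pybase64.b64decode(encoded))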


def is_dlblas_installed():
6 changes: 3 additions & 3 deletions lmdeploy/vl/utils.py
@@ -1,9 +1,9 @@
# Copyright (c) OpenMMLab. All rights reserved.
import base64
import os
from io import BytesIO
from typing import Union

import pybase64
import requests
from PIL import Image, ImageFile

@@ -40,13 +40,13 @@ def encode_image_base64(image: Union[str, Image.Image]) -> str:
# use dummy image
image = Image.new('RGB', (32, 32))
image.save(buffered, format='PNG')
res = base64.b64encode(buffered.getvalue()).decode('utf-8')
res = pybase64.b64encode(buffered.getvalue()).decode('utf-8')
return res


def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
"""Load image from base64 format."""
return Image.open(BytesIO(base64.b64decode(image)))
return Image.open(BytesIO(pybase64.b64decode(image)))
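
# encode_image_base64 and load_image_from_base64 are inverses (modulo PNG
# re-encoding); a minimal round-trip sketch:
from PIL import Image

img = Image.new('RGB', (32, 32))
restored = load_image_from_base64(encode_image_base64(img))
assert restored.size == (32, 32)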


def load_image(image_url: Union[str, Image.Image]) -> Image.Image:
1 change: 1 addition & 0 deletions requirements/runtime_ascend.txt
@@ -12,6 +12,7 @@ partial_json_parser
peft<=0.11.1
pillow
protobuf
pybase64
pydantic>2.0.0
pyzmq
ray
1 change: 1 addition & 0 deletions requirements/runtime_camb.txt
@@ -10,6 +10,7 @@ partial_json_parser
peft<=0.11.1
pillow
protobuf
pybase64
pydantic>2.0.0
pyzmq
safetensors
1 change: 1 addition & 0 deletions requirements/runtime_cuda.txt
@@ -13,6 +13,7 @@ peft<=0.14.0
pillow
prometheus_client
protobuf
pybase64
pydantic>2.0.0
pyzmq
ray
1 change: 1 addition & 0 deletions requirements/runtime_maca.txt
@@ -10,6 +10,7 @@ partial_json_parser
peft<=0.11.1
pillow
protobuf
pybase64
pydantic>2.0.0
pyzmq
safetensors
1 change: 1 addition & 0 deletions requirements/runtime_rocm.txt
@@ -11,6 +11,7 @@ partial_json_parser
peft<=0.14.0
pillow
protobuf
pybase64
pydantic>2.0.0
pyzmq
ray