Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/online_serving/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ After FastDeploy is launched, it supports continuous monitoring of the FastDeplo
| Token | `fastdeploy:prompt_tokens_total` | Counter | Total number of processed prompt tokens | count |
| Token | `fastdeploy:generation_tokens_total` | Counter | Total number of generated tokens | count |
| Token | `fastdeploy:request_prompt_tokens` | Histogram | Prompt token count per request | count |
| Token | `fastdeploy:request_token_ratio` | Histogram | Token generation rate per request (generated tokens per second) | count |
| Token | `fastdeploy:request_generation_tokens` | Histogram | Generation token count per request | count |
| Token | `fastdeploy:request_params_max_tokens` | Histogram | Distribution of `max_tokens` per request | count |
| Batch | `fastdeploy:available_batch_size` | Gauge | Number of additional requests that can be inserted during Decode | count |
Expand Down
1 change: 1 addition & 0 deletions docs/zh/online_serving/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
| Token | `fastdeploy:prompt_tokens_total` | Counter | 已处理的 prompt token 总数 | 个 |
| Token | `fastdeploy:generation_tokens_total` | Counter | 已生成的 token 总数 | 个 |
| Token | `fastdeploy:request_prompt_tokens` | Histogram | 每个请求的 prompt token 数量 | 个 |
| Token | `fastdeploy:request_token_ratio` | Histogram | 每个请求的 token 生成速率（每秒生成的 token 数） | 个 |
| Token | `fastdeploy:request_generation_tokens` | Histogram | 每个请求的 generation token 数量 | 个 |
| Token | `fastdeploy:request_params_max_tokens` | Histogram | 请求的 max_tokens 分布 | 个 |
| 批处理 | `fastdeploy:available_batch_size` | Gauge | Decode 阶段还可以插入的请求数量 | 个 |
Expand Down
51 changes: 51 additions & 0 deletions fastdeploy/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ class MetricsManager:
request_params_max_tokens: "Histogram"
prompt_tokens_total: "Counter"
request_prompt_tokens: "Histogram"
request_token_ratio: "Histogram"

# 定义所有指标配置

Expand Down Expand Up @@ -559,6 +560,56 @@ class MetricsManager:
"description": "Number of prefill tokens processed",
"kwargs": {"buckets": build_1_2_5_buckets(33792)},
},
"request_token_ratio": {
"type": Histogram,
"name": "fastdeploy:request_token_ratio",
"description": "Ratio of output tokens to input tokens (generation_tokens / prompt_tokens)",
"kwargs": {
"buckets": [
0,
5,
10,
15,
20,
25,
30,
35,
40,
45,
50,
55,
60,
65,
70,
75,
80,
85,
90,
95,
100,
105,
110,
115,
120,
125,
130,
135,
140,
145,
150,
155,
160,
165,
170,
175,
180,
185,
190,
195,
200,
]
},
},
}

def __init__(self):
Expand Down
12 changes: 6 additions & 6 deletions fastdeploy/output/token_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,9 +206,9 @@ def _process_per_token(self, task, batch_id: int, token_ids: np.ndarray, result:
llm_logger.info(
f"Request: {task_id} finished, number of " f"generated tokens: {self.tokens_counter[task_id]}."
)
llm_logger.info(
f"Request: {task_id} token ratio: {self.tokens_counter[task_id] / (time.time() - task.inference_start_time)}"
)
token_ratio = self.tokens_counter[task_id] / (time.time() - task.inference_start_time)
llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}")
main_process_metrics.request_token_ratio.observe(token_ratio)
llm_logger.info(f"{self.resource_manager.info()}")
if self.cfg.speculative_config.method:
self._compute_speculative_status()
Expand Down Expand Up @@ -823,9 +823,9 @@ def _process_batch_output(self):
f"Request: {task_id} finished, number of "
f"generated tokens: {self.tokens_counter[task_id]}, token_id:{token_id},is_prefill:{is_prefill},recovery_stop:{recovery_stop}"
)
llm_logger.info(
f"Request: {task_id} token ratio: {self.tokens_counter[task_id] / (time.time() - task.inference_start_time)}"
)
token_ratio = self.tokens_counter[task_id] / (time.time() - task.inference_start_time)
llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}")
main_process_metrics.request_token_ratio.observe(token_ratio)
llm_logger.info(f"{self.resource_manager.info()}")
if self.cfg.speculative_config.method:
self._compute_speculative_status(result)
Expand Down
Loading