diff --git a/transformers/llm/engine/tools/llm_bench.cpp b/transformers/llm/engine/tools/llm_bench.cpp index 029077f58..621762a51 100644 --- a/transformers/llm/engine/tools/llm_bench.cpp +++ b/transformers/llm/engine/tools/llm_bench.cpp @@ -154,6 +154,7 @@ struct TestInstance { bool useMmap; int nPrompt; int nGenerate; + /* Generated-token count for each recorded run. main appends one entry next to every decodeUs entry, taken from context->gen_seq_len (minus one when llm->stoped() is true). */ std::vector nGenerates; std::vector prefillUs; std::vector decodeUs; std::vector samplesUs; @@ -184,6 +185,14 @@ struct TestInstance { return ts; } + /* Overload that takes one token count per run instead of a single count. Returns a vector with n_tokens.size() entries where ts[i] = 1e6 * n_tokens[i] / cost_us[i]. NOTE(review): presumably tokens per second with cost_us in microseconds, judging by the 1e6 factor and the Us suffix - confirm. NOTE(review): cost_us is indexed up to n_tokens.size() with no length check, a zero cost_us[i] is not guarded, the int index is compared against an unsigned size(), and both vectors are copied on every call. */ std::vector getTokensPerSecond(std::vector n_tokens, std::vector cost_us) const { + std::vector ts(n_tokens.size()); + for (int i = 0; i < n_tokens.size(); ++i) { + /* 1e6 is a double literal, so the division below is done in floating point. */ ts[i] = 1e6 * n_tokens[i] / cost_us[i]; + } + return ts; + } + double getAvgUs(std::vector v) const { return ::avg(v); } double getStdevUs(std::vector v) const { return ::stdev(v); } enum fieldType { STRING, BOOL, INT, FLOAT }; @@ -354,7 +363,7 @@ struct markdownPrinter : public Printer { snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.getAvgUs(spd), t.getStdevUs(spd)); value = buf; } else if (field == "speed(tok/s)") { - auto decode_speed = t.getTokensPerSecond(t.nGenerate, t.decodeUs); + /* Decode speed is now computed from the per-run counts in nGenerates rather than the single nGenerate value. */ auto decode_speed = t.getTokensPerSecond(t.nGenerates, t.decodeUs); auto prefill_speed = t.getTokensPerSecond(t.nPrompt, t.prefillUs); snprintf(buf, sizeof(buf), "%.2f ± %.2f
%.2f ± %.2f", t.getAvgUs(prefill_speed), t.getStdevUs(prefill_speed), t.getAvgUs(decode_speed), t.getStdevUs(decode_speed)); value = buf; @@ -899,6 +908,11 @@ int main(int argc, char ** argv) { if (i > 0) { // Exclude the first performance value. t.prefillUs.push_back(prefillTime); t.decodeUs.push_back(decodeTime); + if (llm->stoped()) { + t.nGenerates.push_back(context->gen_seq_len - 1); + } else { + t.nGenerates.push_back(context->gen_seq_len); + } } } if (printHeader) {