From 464498217bcfc22c013536995d2a10d4f2c2c5bd Mon Sep 17 00:00:00 2001 From: ali asaria Date: Wed, 12 Mar 2025 14:56:07 -0400 Subject: [PATCH 1/9] handle value error if version can't be parsed --- fastchat/model/model_adapter.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 16cf5d2b6..74fe3e196 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -253,7 +253,12 @@ def load_model( kwargs = {"torch_dtype": torch.float16} import transformers - version = tuple(int(v) for v in transformers.__version__.split(".")) + try: + version = tuple(int(v) for v in transformers.__version__.split(".")) + except ValueError: + # some versions of transformers have a different version format ( + # e.g. 4.50.0.dev0) and these break this parser so we set a default + version = (4, 36, 0) if version < (4, 35, 0): # NOTE: Recent transformers library seems to fix the mps issue, also # it has made some changes causing compatibility issues with our From 53ef276a613ca3074e61ac2ea65e77ae7d81b0a4 Mon Sep 17 00:00:00 2001 From: deep1401 Date: Mon, 24 Mar 2025 14:36:08 -0400 Subject: [PATCH 2/9] Enable Fastchat serve to send top logprobs as well --- fastchat/serve/inference.py | 58 +++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/fastchat/serve/inference.py b/fastchat/serve/inference.py index 6d155aab7..ad676ce46 100644 --- a/fastchat/serve/inference.py +++ b/fastchat/serve/inference.py @@ -83,6 +83,8 @@ def generate_stream( echo = bool(params.get("echo", True)) stop_str = params.get("stop", None) stop_token_ids = params.get("stop_token_ids", None) or [] + logprobs_requested = params.get("logprobs") is not None + top_logprobs_n = int(params.get("top_logprobs_n", 5) if logprobs_requested else 0) if tokenizer.eos_token_id not in stop_token_ids: stop_token_ids.append(tokenizer.eos_token_id) @@ -116,9 +118,12 @@ def generate_stream( past_key_values = out = None token_logprobs = [None] # The first token has no logprobs. + top_logprobs_list = [{}] # The first token has no top logprobs. 
sent_interrupt = False finish_reason = None stopped = False + last_sent_token_pos = 0 + for i in range(max_new_tokens): if i == 0: # prefill if model.config.is_encoder_decoder: @@ -142,6 +147,8 @@ def generate_stream( shift_input_ids[0].tolist(), shift_logits[0] ): token_logprobs.append(logit[label_id]) + # Add empty top_logprobs during prefill (would need to reconstruct full logits tensor to get these) + top_logprobs_list.append({}) else: # decoding if model.config.is_encoder_decoder: out = model.decoder( @@ -197,6 +204,28 @@ def generate_stream( torch.log_softmax(logits[0, -1, :], dim=-1)[token].tolist() ) + # Calculate top logprobs for the current token if needed + if logprobs_requested and top_logprobs_n > 0: + # Get raw logits for current position + current_logits = torch.log_softmax(logits[0, -1, :], dim=-1) + + # Get top tokens and their logprobs + topk_logits, topk_indices = torch.topk(current_logits, min(top_logprobs_n, len(current_logits))) + + # Create dictionary of token → logprob + top_dict = {} + for logit, token_id in zip(topk_logits.tolist(), topk_indices.tolist()): + token_text = tokenizer.decode([token_id]) # Use list to ensure proper decoding + if token_text and token_text.strip(): # Check if token is non-empty after stripping + # If the same token appears with different logprobs, keep the highest one + if token_text not in top_dict or logit > top_dict[token_text]: + top_dict[token_text] = logit + + top_logprobs_list.append(top_dict) + else: + top_logprobs_list.append({}) + + if token in stop_token_ids: stopped = True else: @@ -219,21 +248,26 @@ def generate_stream( ) ret_logprobs = None if logprobs is not None: + # Calculate the start position for this streaming chunk + if echo: + start_pos = last_sent_token_pos + tokens_to_send = output_ids[start_pos:] + else: + start_pos = max(last_sent_token_pos, input_echo_len) + tokens_to_send = output_ids[start_pos:] + + # Update last sent position for next stream chunk + last_sent_token_pos = len(output_ids) + + # Format response with only new tokens ret_logprobs = { "text_offset": [], - "tokens": [ - tokenizer.decode(token) - for token in ( - output_ids if echo else output_ids[input_echo_len:] - ) - ], - "token_logprobs": token_logprobs - if echo - else token_logprobs[input_echo_len:], - "top_logprobs": [{}] - * len(token_logprobs if echo else token_logprobs[input_echo_len:]), + "tokens": [tokenizer.decode(token) for token in tokens_to_send], + "token_logprobs": token_logprobs[start_pos:], + "top_logprobs": top_logprobs_list[start_pos:], } - # Compute text_offset + + # Compute text_offset for just this chunk curr_pos = 0 for text in ret_logprobs["tokens"]: ret_logprobs["text_offset"].append(curr_pos) From db7e748e296f84e75eae0c61c5a14f3ddaa6485b Mon Sep 17 00:00:00 2001 From: deep1401 Date: Thu, 27 Mar 2025 11:09:32 -0400 Subject: [PATCH 3/9] Add Gemma 3 support --- fastchat/model/model_adapter.py | 37 +++++++++ fastchat/model/model_gemma3.py | 132 ++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+) create mode 100644 fastchat/model/model_gemma3.py diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 74fe3e196..93f872a39 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -23,6 +23,7 @@ LlamaTokenizer, LlamaForCausalLM, T5Tokenizer, + Gemma3ForCausalLM, ) from fastchat.constants import CPU_ISA @@ -36,6 +37,7 @@ from fastchat.model.model_exllama import generate_stream_exllama from fastchat.model.model_xfastertransformer import generate_stream_xft 
from fastchat.model.model_cllm import generate_stream_cllm +from fastchat.model.model_gemma3 import generate_stream_gemma3 from fastchat.model.monkey_patch_non_inplace import ( replace_llama_attn_with_non_inplace_operations, @@ -419,6 +421,7 @@ def get_generate_stream_function(model: torch.nn.Module, model_path: str): is_xft = "xft" in model_type is_yuan = "yuan" in model_type is_cllm = "consistency-llm" in model_path.lower() + is_gemma3 = "gemma-3" in model_path.lower() if is_chatglm: return generate_stream_chatglm @@ -434,6 +437,8 @@ def get_generate_stream_function(model: torch.nn.Module, model_path: str): return generate_stream_yuan2 elif is_cllm: return generate_stream_cllm + elif is_gemma3: + return generate_stream_gemma3 elif peft_share_base_weights and is_peft: # Return a curried stream function that loads the right adapter @@ -458,6 +463,7 @@ def generate_stream_peft( is_xft = "xft" in base_model_type is_yuan = "yuan" in base_model_type is_cllm = "consistency-llm" in model_path.lower() + is_gemma3 = "gemma-3" in model_path.lower() generate_stream_function = generate_stream if is_chatglm: @@ -474,6 +480,8 @@ def generate_stream_peft( generate_stream_function = generate_stream_yuan2 elif is_cllm: generate_stream_function = generate_stream_cllm + elif is_gemma3: + generate_stream_function = generate_stream_gemma3 for x in generate_stream_function( model, tokenizer, @@ -822,6 +830,31 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): ) return model, tokenizer +class Gemma3Adapter(BaseModelAdapter): + """The model adapter for google/gemma-3""" + + def match(self, model_path: str): + return "gemma-3" in model_path.lower() + + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + revision = from_pretrained_kwargs.get("revision", "main") + device_map = from_pretrained_kwargs.get("device_map", None) + if device_map == "sequential": + device_map = "auto" + # print("From pretrained kwargs", from_pretrained_kwargs) + tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) + model = Gemma3ForCausalLM.from_pretrained( + model_path, + revision=revision, + torch_dtype=torch.bfloat16, + device_map=device_map, + ) + return model, tokenizer + + + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("gemma") + class KoalaAdapter(BaseModelAdapter): """The model adapter for Koala""" @@ -2505,8 +2538,12 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("api_based_default") + + + # Note: the registration order matters. # The one registered earlier has a higher matching priority. 
+register_model_adapter(Gemma3Adapter) register_model_adapter(PeftModelAdapter) register_model_adapter(StableVicunaAdapter) register_model_adapter(VicunaAdapter) diff --git a/fastchat/model/model_gemma3.py b/fastchat/model/model_gemma3.py new file mode 100644 index 000000000..61d41379c --- /dev/null +++ b/fastchat/model/model_gemma3.py @@ -0,0 +1,132 @@ +from threading import Thread +import gc +import torch +from transformers import TextIteratorStreamer + +def generate_stream_gemma3( + model, + tokenizer, + params, + device, + context_len, + stream_interval=2, + judge_sent_end=False +): + """Custom generate stream function for Gemma-3 models""" + # Get parameters from the request + prompt = params.get("prompt", "") + messages = params.get("messages", None) + temperature = float(params.get("temperature", 1.0)) + repetition_penalty = float(params.get("repetition_penalty", 1.0)) + top_p = float(params.get("top_p", 1.0)) + top_k = int(params.get("top_k", -1)) # -1 means disable + max_new_tokens = int(params.get("max_new_tokens", 256)) + echo = bool(params.get("echo", True)) + stop_str = params.get("stop", None) + stop_token_ids = params.get("stop_token_ids", None) or [] + model_name = params.get("model", None) + + if tokenizer.eos_token_id not in stop_token_ids: + stop_token_ids.append(tokenizer.eos_token_id) + + is_base_model = "pt" in model_name.lower() or "base" in model_name.lower() + + if not is_base_model: + # Format input based on whether we have messages or a plain prompt + if messages: + inputs = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt" + ).to(model.device) + else: + messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}] + inputs = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt" + ).to(model.device) + else: + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + + input_ids = inputs["input_ids"] + input_echo_len = input_ids.shape[1] + + # Configure generation parameters + generate_kwargs = { + "max_new_tokens": max_new_tokens, + "do_sample": temperature > 0.0, + "temperature": temperature if temperature > 0.0 else 1.0, + } + + if top_p < 1.0: + generate_kwargs["top_p"] = top_p + if top_k > 0: + generate_kwargs["top_k"] = top_k + if repetition_penalty > 1.0: + generate_kwargs["repetition_penalty"] = repetition_penalty + + streamer = TextIteratorStreamer(tokenizer, skip_prompt=not echo, skip_special_tokens=True) + generate_kwargs["streamer"] = streamer + + # Start generation in a separate thread + thread = Thread(target=lambda: model.generate(input_ids=input_ids, **generate_kwargs)) + thread.start() + + # Track generation progress + generated_tokens = 0 + output_text = "" + + # Stream tokens + for new_text in streamer: + output_text += new_text + generated_tokens += 1 + + # Check for stop strings + should_stop = False + if stop_str: + if isinstance(stop_str, str): + if stop_str in output_text: + output_text = output_text[: output_text.find(stop_str)] + should_stop = True + elif isinstance(stop_str, list): + for stop in stop_str: + if stop in output_text: + output_text = output_text[: output_text.find(stop)] + should_stop = True + break + + # Stream at intervals or when stopping + if generated_tokens % stream_interval == 0 or should_stop: + yield { + "text": output_text, + "usage": { + "prompt_tokens": input_echo_len, + "completion_tokens": generated_tokens, + "total_tokens": input_echo_len + 
generated_tokens, + }, + "finish_reason": "stop" if should_stop else None, + } + + if should_stop: + break + + # Final output with finish reason + if thread.is_alive(): + thread.join( + timeout=3600 + ) # Arbitrary value, but if it doesn't complete in this much time then something is wrong + + yield { + "text": output_text, + "usage": { + "prompt_tokens": input_echo_len, + "completion_tokens": generated_tokens, + "total_tokens": input_echo_len + generated_tokens, + }, + "finish_reason": "length", + } + + # Clean up + gc.collect() + torch.cuda.empty_cache() + if device == "xpu": + torch.xpu.empty_cache() + if device == "npu": + torch.npu.empty_cache() \ No newline at end of file From 5d628e28ba3655a5ea8ba82f2a113c18ac474987 Mon Sep 17 00:00:00 2001 From: deep1401 Date: Fri, 25 Apr 2025 13:25:54 -0400 Subject: [PATCH 4/9] Changes to vLLM worker to work with custom_dtype --- fastchat/serve/vllm_worker.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py index 0af680bb5..a7cb72bb7 100644 --- a/fastchat/serve/vllm_worker.py +++ b/fastchat/serve/vllm_worker.py @@ -261,6 +261,7 @@ async def api_model_details(request: Request): parser.add_argument( "--conv-template", type=str, default=None, help="Conversation prompt template." ) + parser.add_argument("--model_dtype", type=str, default="auto") parser.add_argument( "--trust_remote_code", action="store_false", @@ -281,6 +282,8 @@ async def api_model_details(request: Request): parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() + if args.model_dtype and args.model_dtype != "auto" and args.model_dtype.strip() != "": + args.dtype = args.model_dtype if args.model_path: args.model = args.model_path if args.num_gpus > 1: From 6a9372fd0699692b874a5219fe260dbfa695b2ae Mon Sep 17 00:00:00 2001 From: deep1401 Date: Wed, 30 Apr 2025 10:57:26 -0700 Subject: [PATCH 5/9] Change how controller log is assigned --- fastchat/serve/controller.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fastchat/serve/controller.py b/fastchat/serve/controller.py index 42d928403..edbba39cb 100644 --- a/fastchat/serve/controller.py +++ b/fastchat/serve/controller.py @@ -28,7 +28,7 @@ from fastchat.utils import build_logger -logger = build_logger("controller", "controller.log") +logger = None class DispatchMethod(Enum): @@ -351,6 +351,7 @@ async def worker_api_get_status(request: Request): def create_controller(): + global logger parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=21001) @@ -367,7 +368,14 @@ def create_controller(): default=False, help="Enable SSL. 
Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'.", ) + parser.add_argument( + "--log-file", + type=str, + default="controller.log", + help="Path to the controller log file", + ) args = parser.parse_args() + logger = build_logger("controller", args.log_file) logger.info(f"args: {args}") controller = Controller(args.dispatch_method) From 05f6ac3a261399717d55c8febb6c925748998353 Mon Sep 17 00:00:00 2001 From: deep1401 Date: Mon, 5 May 2025 11:36:17 -0700 Subject: [PATCH 6/9] Changes for publishing pypi package for tlab-inference --- README.md | 26 +++++++++++++----------- pyproject.toml | 55 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 6510b8ab7..b5cf4cc4d 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,17 @@ -# FastChat -| [**Demo**](https://lmarena.ai/) | [**Discord**](https://discord.gg/6GXcFg3TH8) | [**X**](https://x.com/lmsysorg) | +# transformerlab-inference + + +> This repository started as a fork of [FastChat](https://github.com/lm-sys/FastChat) and is now a separate project being developed independently by the [TransformerLab](https://www.transformerlab.ai/) team. This repository is intended for the inference service of the [Transformer Lab project](https://github.com/transformerlab/transformerlab-app). -FastChat is an open platform for training, serving, and evaluating large language model based chatbots. -- FastChat powers Chatbot Arena ([lmarena.ai](https://lmarena.ai)), serving over 10 million chat requests for 70+ LLMs. -- Chatbot Arena has collected over 1.5M human votes from side-by-side LLM battles to compile an online [LLM Elo leaderboard](https://lmarena.ai/?leaderboard). +FastChat is an open platform for training, serving, and evaluating large language model based chatbots. + FastChat's core features include: - The training and evaluation code for state-of-the-art models (e.g., Vicuna, MT-Bench). - A distributed multi-model serving system with web UI and OpenAI-compatible RESTful APIs. -## News + ## Contents - [Install](#install) @@ -48,10 +50,10 @@ pip3 install "fschat[model_worker,webui]" ### Method 2: From source -1. Clone this repository and navigate to the FastChat folder +1. Clone this repository and navigate to the transformerlab-inference folder ```bash -git clone https://github.com/lm-sys/FastChat.git -cd FastChat +git clone https://github.com/transformerlab/transformerlab-inference.git +cd transformerlab-inference ``` If you are running on Mac: @@ -97,7 +99,7 @@ You can use the commands below to chat with them. They will automatically downlo ## Inference with Command Line Interface - + (Experimental Feature: You can specify `--style rich` to enable rich text output and better text streaming quality for some non-ASCII content. This may not work properly on certain terminals.) @@ -202,7 +204,7 @@ export FASTCHAT_USE_MODELSCOPE=True ## Serving with Web GUI - + To serve using the web UI, you need three main components: web servers that interface with users, model workers that host one or more models, and a controller to coordinate the webserver and model workers. You can learn more about the architecture [here](docs/server_arch.md). 
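
For reference alongside the serving description above, here is a minimal sketch (not part of the diff itself) of bringing up those three components with the flags introduced in patches 4 and 5. The model name, ports, and log path are assumptions, and the worker presumes vLLM is installed; the flag names follow the argument parsers shown in the diffs.

```python
# Sketch only: launch the controller, a vLLM worker, and the OpenAI-compatible
# API server, exercising --log-file (patch 5) and --model_dtype (patch 4).
import subprocess

procs = [
    # Controller: the log file location is now configurable (patch 5).
    subprocess.Popen([
        "python3", "-m", "fastchat.serve.controller",
        "--host", "0.0.0.0", "--port", "21001",
        "--log-file", "/tmp/controller.log",  # assumed path
    ]),
    # vLLM worker: --model_dtype is copied into vLLM's dtype unless it is
    # empty or "auto" (patch 4). The model choice is an assumption.
    subprocess.Popen([
        "python3", "-m", "fastchat.serve.vllm_worker",
        "--model-path", "meta-llama/Llama-3.1-8B-Instruct",
        "--controller-address", "http://localhost:21001",
        "--model_dtype", "bfloat16",
    ]),
    # OpenAI-compatible API server, unchanged from upstream FastChat.
    subprocess.Popen([
        "python3", "-m", "fastchat.serve.openai_api_server",
        "--controller-address", "http://localhost:21001",
        "--host", "0.0.0.0", "--port", "8000",
    ]),
]

for p in procs:
    p.wait()
```
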
diff --git a/pyproject.toml b/pyproject.toml index 916aaeae0..d9012d969 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] -name = "fschat" +name = "transformerlab-inference" version = "0.2.36" description = "An open platform for training, serving, and evaluating large language model based chatbots." readme = "README.md" @@ -13,24 +13,61 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", ] dependencies = [ - "aiohttp", "fastapi", "httpx", "markdown2[all]", "nh3", "numpy", - "prompt_toolkit>=3.0.0", "pydantic<3,>=2.0.0", "pydantic-settings", "psutil", "requests", "rich>=10.0.0", - "shortuuid", "tiktoken", "uvicorn", + "aiohttp", + "fastapi", + "httpx", + "markdown2[all]", + "nh3", + "numpy", + "prompt_toolkit>=3.0.0", + "pydantic<3,>=2.0.0", + "pydantic-settings", + "psutil", + "requests", + "rich>=10.0.0", + "shortuuid", + "tiktoken", + "uvicorn", ] [project.optional-dependencies] -model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0", "protobuf", "openai", "anthropic"] +model_worker = [ + "accelerate>=0.21", + "peft", + "sentencepiece", + "torch", + "transformers>=4.31.0", + "protobuf", + "openai", + "anthropic", +] webui = ["gradio>=4.10", "plotly", "scipy"] train = ["einops", "flash-attn>=2.0", "wandb"] llm_judge = ["openai<1", "anthropic>=0.3", "ray"] dev = ["black==23.3.0", "pylint==2.8.2"] [project.urls] -"Homepage" = "https://github.com/lm-sys/fastchat" -"Bug Tracker" = "https://github.com/lm-sys/fastchat/issues" +"Homepage" = "https://github.com/transformerlab/FastChat" +"Bug Tracker" = "https://github.com/transformerlab/transformerlab-app/issues" [tool.setuptools.packages.find] -exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] +exclude = [ + "assets*", + "benchmark*", + "docs", + "dist*", + "playground*", + "scripts*", + "tests*", +] [tool.wheel] -exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] +exclude = [ + "assets*", + "benchmark*", + "docs", + "dist*", + "playground*", + "scripts*", + "tests*", +] From 8590bc21a17cd1186436e4c05b2a2588e04987df Mon Sep 17 00:00:00 2001 From: deep1401 Date: Mon, 5 May 2025 11:55:17 -0700 Subject: [PATCH 7/9] Typo fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b5cf4cc4d..905d7a25c 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # transformerlab-inference -> This repository started as a fork of [FastChat](https://github.com/lm-sys/FastChat) and is now a separate project being developed independently by the [TransformerLab](https://www.transformerlab.ai/) team. This repository is intended for the inference service of the [Transformer Lab project](https://github.com/transformerlab/transformerlab-app). +> This repository started as a fork of [FastChat](https://github.com/lm-sys/FastChat) and is now a separate project being developed independently by the [Transformer Lab](https://www.transformerlab.ai/) team. This repository is intended for the inference service of the [Transformer Lab project](https://github.com/transformerlab/transformerlab-app). FastChat is an open platform for training, serving, and evaluating large language model based chatbots. 
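
For reference, a hypothetical direct use of the Gemma-3 stream function added in patch 3. The checkpoint and sampling values are assumptions; the params keys mirror what generate_stream_gemma3 reads, and "model" must be supplied because the function lower-cases it to decide between base and instruction-tuned prompting.

```python
# Sketch only: call generate_stream_gemma3 outside the worker stack.
import torch
from transformers import AutoTokenizer, Gemma3ForCausalLM
from fastchat.model.model_gemma3 import generate_stream_gemma3

model_path = "google/gemma-3-1b-it"  # assumption: any text-only Gemma-3 checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = Gemma3ForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.bfloat16, device_map="auto"
)

params = {
    "model": model_path,  # only used for the base-vs-instruct ("pt"/"base") check
    "prompt": "Explain what a model adapter does in one sentence.",
    "temperature": 0.7,
    "top_p": 0.9,
    "max_new_tokens": 128,
    "echo": False,
}

for chunk in generate_stream_gemma3(
    model, tokenizer, params, device="cuda", context_len=8192
):
    text = chunk["text"]  # accumulated output so far, not a delta
print(text)
```
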
From 02a8cb23041c3ad5674eebb070453bdf0345367b Mon Sep 17 00:00:00 2001 From: Deep Gandhi <46297564+deep1401@users.noreply.github.com> Date: Mon, 5 May 2025 13:02:56 -0700 Subject: [PATCH 8/9] Restore pyproject.toml --- pyproject.toml | 55 +++++++++----------------------------------------- 1 file changed, 9 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d9012d969..916aaeae0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] -name = "transformerlab-inference" +name = "fschat" version = "0.2.36" description = "An open platform for training, serving, and evaluating large language model based chatbots." readme = "README.md" @@ -13,61 +13,24 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", ] dependencies = [ - "aiohttp", - "fastapi", - "httpx", - "markdown2[all]", - "nh3", - "numpy", - "prompt_toolkit>=3.0.0", - "pydantic<3,>=2.0.0", - "pydantic-settings", - "psutil", - "requests", - "rich>=10.0.0", - "shortuuid", - "tiktoken", - "uvicorn", + "aiohttp", "fastapi", "httpx", "markdown2[all]", "nh3", "numpy", + "prompt_toolkit>=3.0.0", "pydantic<3,>=2.0.0", "pydantic-settings", "psutil", "requests", "rich>=10.0.0", + "shortuuid", "tiktoken", "uvicorn", ] [project.optional-dependencies] -model_worker = [ - "accelerate>=0.21", - "peft", - "sentencepiece", - "torch", - "transformers>=4.31.0", - "protobuf", - "openai", - "anthropic", -] +model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0", "protobuf", "openai", "anthropic"] webui = ["gradio>=4.10", "plotly", "scipy"] train = ["einops", "flash-attn>=2.0", "wandb"] llm_judge = ["openai<1", "anthropic>=0.3", "ray"] dev = ["black==23.3.0", "pylint==2.8.2"] [project.urls] -"Homepage" = "https://github.com/transformerlab/FastChat" -"Bug Tracker" = "https://github.com/transformerlab/transformerlab-app/issues" +"Homepage" = "https://github.com/lm-sys/fastchat" +"Bug Tracker" = "https://github.com/lm-sys/fastchat/issues" [tool.setuptools.packages.find] -exclude = [ - "assets*", - "benchmark*", - "docs", - "dist*", - "playground*", - "scripts*", - "tests*", -] +exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] [tool.wheel] -exclude = [ - "assets*", - "benchmark*", - "docs", - "dist*", - "playground*", - "scripts*", - "tests*", -] +exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] From 93a9c53acae18e45db425751eb49dd114806da76 Mon Sep 17 00:00:00 2001 From: Deep Gandhi <46297564+deep1401@users.noreply.github.com> Date: Mon, 5 May 2025 13:06:29 -0700 Subject: [PATCH 9/9] Restore README.md --- README.md | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 905d7a25c..6510b8ab7 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,15 @@ -# transformerlab-inference - - -> This repository started as a fork of [FastChat](https://github.com/lm-sys/FastChat) and is now a separate project being developed independently by the [Transformer Lab](https://www.transformerlab.ai/) team. This repository is intended for the inference service of the [Transformer Lab project](https://github.com/transformerlab/transformerlab-app). 
- +# FastChat +| [**Demo**](https://lmarena.ai/) | [**Discord**](https://discord.gg/6GXcFg3TH8) | [**X**](https://x.com/lmsysorg) | FastChat is an open platform for training, serving, and evaluating large language model based chatbots. - +- FastChat powers Chatbot Arena ([lmarena.ai](https://lmarena.ai)), serving over 10 million chat requests for 70+ LLMs. +- Chatbot Arena has collected over 1.5M human votes from side-by-side LLM battles to compile an online [LLM Elo leaderboard](https://lmarena.ai/?leaderboard). + FastChat's core features include: - The training and evaluation code for state-of-the-art models (e.g., Vicuna, MT-Bench). - A distributed multi-model serving system with web UI and OpenAI-compatible RESTful APIs. - + ## Contents - [Install](#install) @@ -50,10 +48,10 @@ pip3 install "fschat[model_worker,webui]" ### Method 2: From source -1. Clone this repository and navigate to the transformerlab-inference folder +1. Clone this repository and navigate to the FastChat folder ```bash -git clone https://github.com/transformerlab/transformerlab-inference.git -cd transformerlab-inference +git clone https://github.com/lm-sys/FastChat.git +cd FastChat ``` If you are running on Mac: @@ -99,7 +97,7 @@ You can use the commands below to chat with them. They will automatically downlo ## Inference with Command Line Interface - + (Experimental Feature: You can specify `--style rich` to enable rich text output and better text streaming quality for some non-ASCII content. This may not work properly on certain terminals.) @@ -204,7 +202,7 @@ export FASTCHAT_USE_MODELSCOPE=True ## Serving with Web GUI - + To serve using the web UI, you need three main components: web servers that interface with users, model workers that host one or more models, and a controller to coordinate the webserver and model workers. You can learn more about the architecture [here](docs/server_arch.md).
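
For reference, a minimal sketch of driving the patched generate_stream (patch 2) with logprob tracking enabled. The checkpoint and generation settings are assumptions; the "logprobs" / "top_logprobs_n" request keys and the shape of the per-chunk logprobs payload follow the diff.

```python
# Sketch only: request per-token logprobs plus top alternatives from the
# HF-based generate_stream in fastchat/serve/inference.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from fastchat.serve.inference import generate_stream

model_path = "lmsys/vicuna-7b-v1.5"  # assumption: any causal LM this path serves
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.float16, device_map="auto"
)

params = {
    "prompt": "The capital of France is",
    "max_new_tokens": 16,
    "temperature": 0.0,    # greedy decoding
    "echo": False,
    "logprobs": 1,         # any non-None value switches logprob tracking on
    "top_logprobs_n": 3,   # alternatives kept per generated token (default 5)
}

for chunk in generate_stream(model, tokenizer, params, device="cuda", context_len=2048):
    lp = chunk.get("logprobs")
    if lp:
        # Each chunk carries only the tokens emitted since the previous chunk,
        # with aligned token_logprobs and top_logprobs lists.
        for tok, tok_lp, alts in zip(lp["tokens"], lp["token_logprobs"], lp["top_logprobs"]):
            print(repr(tok), tok_lp, alts)
```
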