Naphula commited on
Commit
5f463e1
·
verified ·
1 Parent(s): 7080631

Upload 8 files

Browse files
Files changed (8) hide show
  1. __init__.py +36 -0
  2. donor_audit_v3.py +245 -0
  3. eos_scanner.py +17 -1
  4. llama.py +59 -0
  5. model_tools.md +14 -1
  6. moe_defs.py +197 -0
  7. tokeninspector.py +135 -0
  8. tokensurgeon.py +867 -0
__init__.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from typing import List

from mergekit.moe.arch import MoEOutputArchitecture
from mergekit.moe.deepseek import DeepseekMoE
from mergekit.moe.mixtral import MixtralMoE

# Architectures that are always available.
ALL_OUTPUT_ARCHITECTURES: List[MoEOutputArchitecture] = [MixtralMoE(), DeepseekMoE()]

# Optional architectures: each one is registered only if its module imports
# cleanly (a missing or broken file simply leaves it unregistered).
try:
    from mergekit.moe.qwen import QwenMoE

    ALL_OUTPUT_ARCHITECTURES.append(QwenMoE())
except ImportError:
    pass

try:
    from mergekit.moe.qwen3 import Qwen3MoE

    ALL_OUTPUT_ARCHITECTURES.append(Qwen3MoE())
except ImportError:
    pass

try:
    from mergekit.moe.llama import LlamaMoE

    ALL_OUTPUT_ARCHITECTURES.append(LlamaMoE())
except ImportError:
    # Triggers if llama.py is missing or fails to import.
    pass

__all__ = [
    "ALL_OUTPUT_ARCHITECTURES",
    "MoEOutputArchitecture",
]
donor_audit_v3.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 Arcee AI & Kraken Architect
2
+ # SPDX-License-Identifier: BUSL-1.1
3
+
4
+ import logging
5
+ import os
6
+ import sys
7
+ from typing import List, Optional
8
+
9
+ import click
10
+ import torch
11
+ import yaml
12
+ from tqdm import tqdm
13
+
14
+ from mergekit.common import ModelReference
15
+ from mergekit.config import MergeConfiguration
16
+ from mergekit.io.lazy_tensor_loader import LazyTensorLoader, ShardedTensorIndex
17
+ from mergekit.merge_methods.easy_define import merge_method
18
+
19
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
20
+ LOG = logging.getLogger("donor_audit")
21
+
22
@merge_method(
    name="donor_audit",
    pretty_name="Donor Audit",
    reference_url="https://arxiv.org/abs/2408.07990",
)
def _donor_audit_registration(tensors: List[torch.Tensor]) -> torch.Tensor:
    """No-op merge body; exists solely to register the 'donor_audit' name."""
    first = tensors[0]
    return first
30
+
31
def rsce_weight(tvs: torch.Tensor) -> torch.Tensor:
    """Weight each task vector by its mean-square energy, normalized to sum to 1.

    Falls back to a uniform distribution when the total energy is ~zero.
    (Copied from RSCE v3.)
    """
    # Mean-square energy per task vector (reduce over all non-batch dims).
    non_batch_dims = list(range(1, tvs.dim()))
    energy = (tvs * tvs).mean(dim=non_batch_dims)
    total = float(energy.sum())
    if abs(total) < 1e-8:
        count = energy.shape[0]
        return torch.full_like(energy, 1.0 / count)
    return energy / total
42
+
43
+
44
def log_rsce_audit(layer_name: str, weights: torch.Tensor, names: List[str]):
    """Print a bar chart of per-donor influence and append it to rsce_audit.log.

    Args:
        layer_name: Name of the tensor being audited (used in the report header).
        weights: 1-D tensor of normalized donor weights, parallel to ``names``.
        names: Display name (path) for each donor.
    """
    w_list = weights.tolist()
    bar_char = "█"

    # Header
    print(f"\n{'='*60}")
    print("RSCE DONOR AUDIT REPORT")
    print(f"Target Tensor: {layer_name}")
    print(f"{'='*60}")

    # Scale bars relative to the loudest model so the max value fills the bar.
    # FIX: this was recomputed (two max() scans) on every loop iteration even
    # though it is loop-invariant; hoisted out, with an empty-list guard.
    max_val = max(w_list) if w_list and max(w_list) > 0 else 1.0

    lines = []
    for name, w in zip(names, w_list):
        pct = w * 100

        # Relative bar length (relative to the loudest model)
        bar_len = int((w / max_val) * 40)
        bar = bar_char * bar_len

        # Truncate name for clean display
        clean_name = os.path.basename(name)
        if len(clean_name) > 30:
            clean_name = clean_name[:27] + "..."

        lines.append(f"{clean_name:<30} | {bar:<40} | {pct:6.2f}% (Raw: {w:.4f})")

    log_entry = "\n".join(lines)
    print(log_entry)
    print(f"{'='*60}\n")

    # Append to file
    with open("rsce_audit.log", "a", encoding="utf-8") as f:
        f.write(f"\n[Audit {layer_name}]\n" + log_entry + "\n")
80
+
81
+
82
def find_layer0_tensor(loader: "LazyTensorLoader") -> str:
    """Pick a representative layer-0 weight tensor name from the model index.

    Preference order: down_proj, then gate_proj, then c_attn (GPT-NeoX/Qwen),
    then the first layer-0 weight found.

    Raises:
        RuntimeError: if no layer-0 weight tensor exists in the index.
    """
    layer0_markers = (".layers.0.", ".h.0.", ".blocks.0.")
    # Layer-0 weight tensors only (biases excluded).
    candidates = [
        key
        for key in loader.index.tensor_paths.keys()
        if key.endswith(".weight") and any(m in key for m in layer0_markers)
    ]

    # Priority: down_proj > gate_proj > c_attn (GPT-NeoX / Qwen), first match wins.
    for marker in ("down_proj", "gate_proj", "c_attn"):
        for cand in candidates:
            if marker in cand:
                return cand

    if not candidates:
        raise RuntimeError("Could not find any Layer 0 weights in the base model.")

    return candidates[0]
107
+
108
+
109
def load_tensor_safe(model_path: str, tensor_name: str, device="cpu") -> torch.Tensor:
    """Loads a single tensor from a model path.

    Args:
        model_path: Weights location -- either a single shard file or a model directory.
        tensor_name: Exact tensor name to load; on a miss, a fuzzy layer-0
            fallback match is attempted (for slightly different architectures).
        device: Torch device string to load onto (default "cpu").

    Returns:
        The requested tensor converted to float32 for the audit math.

    Note:
        Any failure (I/O error, missing tensor, ...) logs an error and
        terminates the process via ``sys.exit(1)`` instead of raising.
    """
    try:
        # We use ShardedTensorIndex directly to avoid caching overhead of LoaderCache for this simple script
        if os.path.isfile(model_path):
            index = ShardedTensorIndex.from_file(model_path)
        else:
            index = ShardedTensorIndex.from_disk(model_path)
        loader = LazyTensorLoader(index, lazy_unpickle=True)

        # Handle potential naming mismatches (simple check)
        if tensor_name not in index.tensor_paths:
            # Try to find a fuzzy match if exact name fails (e.g. if models have slightly different archs)
            # This is a basic fallback: match on the suffix after "layers.0."
            suffix = tensor_name.split("layers.0.")[-1]
            for k in index.tensor_paths.keys():
                if k.endswith(suffix) and ("layers.0." in k or "h.0." in k):
                    tensor_name = k
                    break

        t = loader.get_tensor(tensor_name, device=device)
        return t.float()  # Convert to float32 for math
    except Exception as e:
        LOG.error(f"Failed to load {tensor_name} from {model_path}: {e}")
        sys.exit(1)
134
+
135
+
136
@click.command()
@click.argument("config_file", type=click.Path(exists=True))
@click.option("--lora-merge-cache", default=None, help="Cache directory for merged LoRAs")
@click.option("--cuda/--no-cuda", default=False, help="Use GPU for calculation (faster math, higher VRAM)")
def main(config_file, lora_merge_cache, cuda):
    """
    RSCE Donor Audit Tool V3.

    Loads Layer 0 from all models in the config and calculates their
    Task Vector magnitude/energy contribution relative to the base model.
    """
    device = "cuda" if cuda and torch.cuda.is_available() else "cpu"
    LOG.info(f"Running audit on {device}...")

    # 1. Parse Config
    with open(config_file, "r", encoding="utf-8") as f:
        config_data = yaml.safe_load(f)
    config = MergeConfiguration.model_validate(config_data)

    # 2. Identify Models
    base_model_ref = config.base_model
    if not base_model_ref:
        LOG.error("Config must specify a `base_model` for RSCE auditing.")
        sys.exit(1)

    # Extract donor models from slices or models list
    donor_refs = []
    if config.models:
        donor_refs = [m.model for m in config.models]
    elif config.slices:
        # Flatten slices to get unique models
        seen = set()
        for s in config.slices:
            for source in s.sources:
                if source.model != base_model_ref and source.model not in seen:
                    donor_refs.append(source.model)
                    seen.add(source.model)

    # Filter out base model if it appeared in donors
    donor_refs = [d for d in donor_refs if d != base_model_ref]

    LOG.info(f"Base Model: {base_model_ref.model.path}")
    LOG.info(f"Found {len(donor_refs)} donor models.")

    # 3. Resolve Paths (Handle LoRAs if necessary)
    def resolve_path(ref: ModelReference):
        # LoRA references must be merged first; fall back to a HF snapshot
        # download when the path does not exist locally.
        if ref.lora:
            if not lora_merge_cache:
                LOG.warning("LoRA detected but --lora-merge-cache not set. This might fail.")
            return ref.merged(cache_dir=lora_merge_cache).model.path

        if not os.path.exists(ref.model.path):
            try:
                from huggingface_hub import snapshot_download
                return snapshot_download(ref.model.path, allow_patterns=["*.safetensors", "*.bin", "*.json"])
            # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; narrowed to Exception.
            except Exception:
                return ref.model.path
        return ref.model.path

    base_path = resolve_path(base_model_ref)
    donor_paths = [resolve_path(d) for d in donor_refs]

    # 4. Identify Target Tensor (Layer 0)
    base_index = ShardedTensorIndex.from_disk(base_path)
    base_loader = LazyTensorLoader(base_index, lazy_unpickle=True)
    target_tensor_name = find_layer0_tensor(base_loader)

    LOG.info(f"Selected audit tensor: {target_tensor_name}")
    LOG.info("Loading tensors into memory...")

    # 5. Load All Tensors
    base_tensor = load_tensor_safe(base_path, target_tensor_name, device)

    donor_tensors = []
    valid_donor_refs = []

    for d_path, d_ref in zip(tqdm(donor_paths, desc="Loading Donors"), donor_refs):
        dt = load_tensor_safe(d_path, target_tensor_name, device)

        # V3: Catch shape mismatches (e.g. a 7B model mixed into a 12B merge)
        if dt.shape != base_tensor.shape:
            LOG.warning(f"\n[!] Shape mismatch for {d_ref.model.path}: expected {base_tensor.shape}, got {dt.shape}. Skipping this model.")
            continue

        donor_tensors.append(dt)
        valid_donor_refs.append(d_ref)

    if not donor_tensors:
        LOG.error("No valid donor tensors found with matching shapes. Exiting.")
        sys.exit(1)

    # 6. Perform RSCE Audit Math
    LOG.info("Calculating Task Vector Energy...")

    # The base model's task vector is zero by definition (it is the anchor).
    base_tv = torch.zeros_like(base_tensor)
    donor_tvs = [dt - base_tensor for dt in donor_tensors]

    all_tvs = torch.stack([base_tv] + donor_tvs, dim=0)
    raw_weights = rsce_weight(all_tvs)

    display_names = ["Base Model (Anchor)"] + [d.model.path for d in valid_donor_refs]

    # 7. Output
    log_rsce_audit(target_tensor_name, raw_weights, display_names)

    LOG.info("Audit complete.")


if __name__ == "__main__":
    main()
eos_scanner.py CHANGED
@@ -31,9 +31,25 @@ def load_json(path):
31
  return None
32
 
33
  def get_model_metadata(model_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  data = {
35
  "path": model_path,
36
- "name": os.path.basename(model_path).replace("!models--", ""),
37
  "gen_eos_id": "MISSING", # From generation_config.json
38
  "tok_eos_str": "MISSING", # From tokenizer_config.json
39
  "vocab_eos_id": "MISSING", # The actual ID of the string in tokenizer.json
 
31
  return None
32
 
33
  def get_model_metadata(model_path):
34
+ # --- NAME FIX LOGIC START ---
35
+ # Normalize path to handle trailing slashes or mixed separators
36
+ norm_path = os.path.normpath(model_path)
37
+ base_name = os.path.basename(norm_path)
38
+
39
+ # If the folder is named "fixed", grab the parent folder name instead
40
+ if base_name == "fixed":
41
+ parent_name = os.path.basename(os.path.dirname(norm_path))
42
+ display_name = f"{parent_name}/fixed"
43
+ else:
44
+ display_name = base_name
45
+
46
+ # Clean up the huggingface cache prefix
47
+ display_name = display_name.replace("!models--", "")
48
+ # --- NAME FIX LOGIC END ---
49
+
50
  data = {
51
  "path": model_path,
52
+ "name": display_name,
53
  "gen_eos_id": "MISSING", # From generation_config.json
54
  "tok_eos_str": "MISSING", # From tokenizer_config.json
55
  "vocab_eos_id": "MISSING", # The actual ID of the string in tokenizer.json
llama.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import tqdm
3
+ import transformers
4
+ from mergekit.moe.arch import MoEOutputArchitecture
5
+ from mergekit.moe.common import copy_tensor_out, initialize_io, select_dtype
6
+ from mergekit.moe.config import MoEMergeConfig
7
+ from mergekit.options import MergeOptions
8
+ from mergekit.architecture import arch_info_for_config
9
+
10
class LlamaMoE(MoEOutputArchitecture):
    """Writes a Llama-based MoE using the Mixtral on-disk layout.

    Dense Llama MLP weights are fanned out into per-expert tensors named after
    Mixtral's w1/w2/w3 convention so standard loaders can read the result.
    """

    def name(self) -> str:
        return "LlamaMoE"

    def supports_config(self, config: MoEMergeConfig, explain: bool = False, trust_remote_code: bool = False) -> bool:
        """Accept only configs whose base model reports model_type 'llama'."""
        model_cfg = config.base_model.config(trust_remote_code=trust_remote_code)
        if model_cfg.model_type != "llama":
            if explain:
                print("LlamaMoE only supports Llama base models")
            return False
        return True

    def write_model(self, out_path: str, config: MoEMergeConfig, merge_options: MergeOptions, router_weights: list[torch.Tensor], shared_router_weights=None):
        """Assemble and write the MoE model to ``out_path``.

        Args:
            out_path: Destination directory for the merged model.
            config: MoE merge configuration (base model, experts, experts_per_token).
            merge_options: Global merge options (dtype, trust_remote_code, ...).
            router_weights: One router (gate) tensor per layer.
            shared_router_weights: Unused; accepted for interface compatibility.
        """
        import json
        import os

        base_model = config.base_model
        base_cfg = base_model.config(trust_remote_code=merge_options.trust_remote_code)

        # 1. Generate the config.json
        out_cfg = base_cfg.to_dict()
        # Note: Most Llama MoEs use the Mixtral architecture name for compatibility with loaders
        out_cfg["architectures"] = ["MixtralForCausalLM"]
        out_cfg["num_local_experts"] = len(config.experts)
        out_cfg["num_experts_per_tok"] = config.experts_per_token

        out_dtype = select_dtype(config, base_cfg)

        # 2. Initialize IO
        loaders, base_loader, writer = initialize_io(config, out_path, merge_options)

        # BUGFIX: out_cfg was assembled but never persisted, so the output
        # directory had no config.json. Write it out explicitly here.
        # (initialize_io is presumed to create out_path; makedirs is a safety net.)
        os.makedirs(out_path, exist_ok=True)
        with open(os.path.join(out_path, "config.json"), "w", encoding="utf-8") as f:
            json.dump(out_cfg, f, indent=2)

        # 3. Map Tensors
        for weight_info in tqdm.tqdm(arch_info_for_config(base_cfg).all_weights(base_cfg), desc="Weights"):
            tensor_name = weight_info.name
            if ".mlp." in tensor_name:
                # Fan the dense MLP out into one copy per expert, renamed to
                # Mixtral's convention: gate_proj->w1, down_proj->w2, up_proj->w3.
                for expert_idx, expert in enumerate(config.experts):
                    expert_name = tensor_name.replace(".mlp.gate_proj", f".block_sparse_moe.experts.{expert_idx}.w1")
                    expert_name = expert_name.replace(".mlp.down_proj", f".block_sparse_moe.experts.{expert_idx}.w2")
                    expert_name = expert_name.replace(".mlp.up_proj", f".block_sparse_moe.experts.{expert_idx}.w3")

                    expert_loader = loaders.get(expert.source_model)
                    copy_tensor_out(weight_info, expert_loader, writer, expert=expert, output_name=expert_name, out_dtype=out_dtype)
            else:
                # Copy Attention and Norms from base model
                copy_tensor_out(weight_info, base_loader, writer, out_dtype=out_dtype)

        # 4. Write Router Weights
        for layer_idx, weight in enumerate(router_weights):
            writer.save_tensor(f"model.layers.{layer_idx}.block_sparse_moe.gate.weight", weight.to(dtype=out_dtype))

        writer.finalize()
model_tools.md CHANGED
@@ -32,8 +32,21 @@ Tools to enhance LLM quantizations and merging
32
  # [metadata_audit.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/metadata_audit.py)
33
  - Checks multiple models within subdirectories for vocab or rope mismatch (useful for large merges). Calibrated for Mistral Nemo 12B by default.
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  # [eos_scanner.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner.py)
36
- - This tool scans the tokenizer jsons to detect any mismatches with EOS tokens, which cause early termination bugs. You can then use the [gen_id_patcher.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/gen_id_patcher.py) to patch missing `generation_config.json` files for EOS token. See [this post](https://huggingface.co/Naphula/Q0_Bench/discussions/1?not-for-all-audiences=true#6987717c762f0a45f672e250) as well as the [EOS Scanner ReadMe](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner_readme.md) for more info.
37
 
38
  # [weight_counter.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/weight_counter.py)
39
  - This counts the number of models in a yaml and adds up the total weight values. Useful for large della/ties merges.
 
32
  # [metadata_audit.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/metadata_audit.py)
33
  - Checks multiple models within subdirectories for vocab or rope mismatch (useful for large merges). Calibrated for Mistral Nemo 12B by default.
34
 
35
+ # llama moe
36
+ - Add support for Llama Mixture of Experts. If you want to merge custom Llama MoE you can add these scripts to your mergekit environment:
37
+ - [mergekit-main\mergekit\architecture\moe_defs.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/moe_defs.py)
38
+ - [mergekit-main\mergekit\__init__.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/__init__.py)
39
+ - [mergekit-main\mergekit\moe\llama.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/llama.py)
40
+ - Then assign the num_experts_per_tok in config.json (or the config.yaml)
41
+
42
+ # [tokensurgeon.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/tokensurgeon.py)
43
+ - Uses adaptive VRAM from Grim Jim's `measure.py` like `graph_v18` to prevent OOM. Use recommended [batch file](https://huggingface.co/spaces/Naphula/model_tools/blob/main/fix_tokenizers.bat) here or modify sh. This supposedly avoids 'cardboard town' fake patches.
44
+
45
+ # [tokeninspector.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/tokeninspector.py)
46
+ - Audit your tokensurgeon results.
47
+
48
  # [eos_scanner.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner.py)
49
+ - Updated! This tool scans the tokenizer jsons to detect any mismatches with EOS tokens, which cause early termination bugs. You can then use the [gen_id_patcher.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/gen_id_patcher.py) to patch missing `generation_config.json` files for EOS token. See [this post](https://huggingface.co/Naphula/Q0_Bench/discussions/1?not-for-all-audiences=true#6987717c762f0a45f672e250) as well as the [EOS Scanner ReadMe](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner_readme.md) for more info.
50
 
51
  # [weight_counter.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/weight_counter.py)
52
  - This counts the number of models in a yaml and adds up the total weight values. Useful for large della/ties merges.
moe_defs.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 Arcee AI
2
+ # SPDX-License-Identifier: LGPL-3.0-only
3
+
4
+ from typing import ClassVar, List, Optional
5
+
6
+ from pydantic import BaseModel
7
+ from transformers import PretrainedConfig
8
+
9
+ from mergekit.architecture.base import (
10
+ ModuleArchitecture,
11
+ WeightInfo,
12
+ )
13
+ from mergekit.architecture.json_definitions import NAME_TO_ARCH
14
+
15
MISTRAL_INFO = NAME_TO_ARCH["MistralForCausalLM"][0]
MISTRAL_MODULE_ARCH = MISTRAL_INFO.modules["default"].architecture


class MixtralModuleArchitecture(ModuleArchitecture, BaseModel):
    """Mixtral layer layout: per-expert w1/w2/w3 MLP weights plus a routing
    gate, with all non-MLP weights delegated to the dense Mistral definition."""

    ARCHITECTURE_NAME: ClassVar[str] = "MixtralForCausalLM"
    num_local_experts: int

    def name(self) -> str:
        return "mixtral"

    @classmethod
    def from_config(cls, config: PretrainedConfig):
        # Expert count comes straight from the model config.
        return MixtralModuleArchitecture(num_local_experts=config.num_local_experts)

    def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
        # Embeddings are identical to dense Mistral.
        return MISTRAL_MODULE_ARCH.pre_weights(config)

    def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
        # Final norm / head are identical to dense Mistral.
        return MISTRAL_MODULE_ARCH.post_weights(config)

    def num_layers_config_key(self) -> str:
        return MISTRAL_MODULE_ARCH.num_layers_config_key()

    def layer_weights(
        self, index: int, config: PretrainedConfig
    ) -> Optional[List[WeightInfo]]:
        prefix = f"model.layers.{index}"
        # Per-expert w1/w2/w3 projections, then the routing gate.
        moe_names = [
            f"{prefix}.block_sparse_moe.experts.{expert_idx}.{param}.weight"
            for expert_idx in range(self.num_local_experts)
            for param in ("w1", "w2", "w3")
        ]
        moe_names.append(f"{prefix}.block_sparse_moe.gate.weight")
        res = [WeightInfo(name=n) for n in moe_names]
        # Keep Mistral's non-MLP weights; the dense MLP is replaced by experts.
        res.extend(
            wi
            for wi in MISTRAL_MODULE_ARCH.layer_weights(index, config)
            if ".mlp." not in wi.name
        )
        return res
59
+
60
+
61
QWEN3_INFO = NAME_TO_ARCH["Qwen3ForCausalLM"][0]
QWEN3_MODULE_ARCH = QWEN3_INFO.modules["default"].architecture


class Qwen3MoeModuleArchitecture(ModuleArchitecture, BaseModel):
    """Qwen3 MoE layer layout: per-expert up/gate/down MLP projections plus a
    routing gate, with all non-MLP weights delegated to dense Qwen3."""

    ARCHITECTURE_NAME: ClassVar[str] = "Qwen3MoeForCausalLM"
    # Number of experts per layer; populated from the model config.
    num_experts: int

    def name(self) -> str:
        return "qwen3_moe"

    @classmethod
    def from_config(cls, config: PretrainedConfig):
        # Requires `num_experts` to be present on the config.
        return Qwen3MoeModuleArchitecture(num_experts=config.num_experts)

    def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
        # Embeddings are identical to dense Qwen3.
        return QWEN3_MODULE_ARCH.pre_weights(config)

    def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
        # Final norm / head are identical to dense Qwen3.
        return QWEN3_MODULE_ARCH.post_weights(config)

    def num_layers_config_key(self) -> str:
        return QWEN3_MODULE_ARCH.num_layers_config_key()

    def layer_weights(
        self, index: int, config: PretrainedConfig
    ) -> Optional[List[WeightInfo]]:
        prefix = f"model.layers.{index}"
        tensor_names = []
        # One up/gate/down projection per expert.
        for expert_idx in range(self.num_experts):
            for param in ("up_proj", "gate_proj", "down_proj"):
                tensor_names.append(
                    prefix + f".mlp.experts.{expert_idx}.{param}.weight"
                )
        # Router gate.
        tensor_names.append(prefix + ".mlp.gate.weight")
        res = []
        for name in tensor_names:
            res.append(WeightInfo(name=name))
        # Keep dense Qwen3's non-MLP weights; the dense MLP is replaced by experts.
        for weight_info in QWEN3_MODULE_ARCH.layer_weights(index, config):
            if ".mlp." in weight_info.name:
                continue
            res.append(weight_info)
        return res
104
+
105
+
106
AFMOE_PARTIAL_INFO = NAME_TO_ARCH["_AfmoePartialForCausalLM"][0]
AFMOE_PARTIAL_MODULE_ARCH = AFMOE_PARTIAL_INFO.modules["default"].architecture


class AfmoeModuleArchitecture(ModuleArchitecture, BaseModel):
    """AFMoE layer layout: extends the partial AFMoE definition with
    per-expert up/gate/down MLP projections (marked optional)."""

    ARCHITECTURE_NAME: ClassVar[str] = "AfmoeForCausalLM"
    # Number of experts per layer; populated from the model config.
    num_experts: int

    def name(self) -> str:
        return "afmoe"

    @classmethod
    def from_config(cls, config: PretrainedConfig):
        # Requires `num_experts` to be present on the config.
        return AfmoeModuleArchitecture(num_experts=config.num_experts)

    def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
        return AFMOE_PARTIAL_MODULE_ARCH.pre_weights(config)

    def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
        return AFMOE_PARTIAL_MODULE_ARCH.post_weights(config)

    def num_layers_config_key(self) -> str:
        return AFMOE_PARTIAL_MODULE_ARCH.num_layers_config_key()

    def layer_weights(
        self, index: int, config: PretrainedConfig
    ) -> Optional[List[WeightInfo]]:
        # Start from the partial architecture's weights, then append experts.
        res = AFMOE_PARTIAL_MODULE_ARCH.layer_weights(index, config) or []
        prefix = f"model.layers.{index}"
        for expert_idx in range(self.num_experts):
            for param in ("up_proj", "gate_proj", "down_proj"):
                res.append(
                    WeightInfo(
                        name=prefix + f".mlp.experts.{expert_idx}.{param}.weight",
                        # optional=True -- presumably because not every AFMoE
                        # layer is an MoE layer; verify against the partial
                        # architecture definition.
                        optional=True,
                    )
                )
        return res
144
+
145
+
146
# Base Llama definitions from the architecture registry.
LLAMA_INFO = NAME_TO_ARCH["LlamaForCausalLM"][0]
LLAMA_MODULE_ARCH = LLAMA_INFO.modules["default"].architecture


class LlamaMoeModuleArchitecture(ModuleArchitecture, BaseModel):
    """Llama-based MoE layout: per-expert MLP weights plus a router gate,
    with all non-MLP weights inherited from the dense Llama architecture."""

    # Architecture string that will appear in the output config.json.
    ARCHITECTURE_NAME: ClassVar[str] = "LlamaMoeForCausalLM"
    num_experts: int

    def name(self) -> str:
        return "llama_moe"

    @classmethod
    def from_config(cls, config: PretrainedConfig):
        # Reads 'num_experts' from the model config, defaulting to 8.
        return LlamaMoeModuleArchitecture(num_experts=getattr(config, "num_experts", 8))

    def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
        # Standard Llama embeddings/norms.
        return LLAMA_MODULE_ARCH.pre_weights(config)

    def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
        # Standard Llama final norm/head.
        return LLAMA_MODULE_ARCH.post_weights(config)

    def num_layers_config_key(self) -> str:
        return LLAMA_MODULE_ARCH.num_layers_config_key()

    def layer_weights(self, index: int, config: PretrainedConfig) -> Optional[List[WeightInfo]]:
        prefix = f"model.layers.{index}"
        # Expert copies of the dense MLP projections.
        res = [
            WeightInfo(name=f"{prefix}.block_sparse_moe.experts.{expert_idx}.{param}.weight")
            for expert_idx in range(self.num_experts)
            for param in ("gate_proj", "up_proj", "down_proj")
        ]
        # Router (gate) weight.
        res.append(WeightInfo(name=f"{prefix}.block_sparse_moe.gate.weight"))
        # Non-MLP weights (attention, norms) come straight from dense Llama;
        # the original .mlp. weights are replaced by the experts above.
        res.extend(
            wi
            for wi in LLAMA_MODULE_ARCH.layer_weights(index, config)
            if ".mlp." not in wi.name
        )
        return res
tokeninspector.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python tokeninspector.py "B:\12B\models--mistralai--Mistral-Nemo-Instruct-2407" "B:\12B\models--aixonlab--Aether-12b.backup" "B:\12B\models--aixonlab--Aether-12b"
2
+
3
+ import os
4
+ import click
5
+ import torch
6
+ import transformers
7
+ from mergekit.io.lazy_tensor_loader import LazyTensorLoader
8
+
9
def get_embed_tensor(model_path):
    """Lazily loads the embedding tensor from a model directory.

    Returns the first tensor whose name contains 'embed_tokens.weight' or
    'wte.weight'; returns None when loading fails or no such tensor exists.
    """
    embed_markers = ("embed_tokens.weight", "wte.weight")
    try:
        loader = LazyTensorLoader.from_disk(model_path)
        for key in loader.index.tensor_paths.keys():
            if any(marker in key for marker in embed_markers):
                return loader.get_tensor(key)
    except Exception as e:
        print(f" [!] Error loading tensors from {model_path}: {e}")
    return None
19
+
20
@click.command()
@click.argument("base_model", type=click.Path(exists=True))
@click.argument("donor_model", type=click.Path(exists=True))
@click.argument("output_model", type=click.Path(exists=True))
def main(base_model, donor_model, output_model):
    # Audits a token-surgeon result: compares BASE (original weights),
    # DONOR (tokenizer source) and OUTPUT (surgered model) on vocab size,
    # embedding geometry, and tokenizer encoding behavior.
    # (Comment instead of a docstring so click's --help output is unchanged.)
    print("="*60)
    print("🔍 TOKEN SURGEON AUDIT TOOL")
    print("="*60)

    print("\n[1] Loading Tokenizers...")
    tok_base = transformers.AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    tok_donor = transformers.AutoTokenizer.from_pretrained(donor_model, trust_remote_code=True)
    tok_out = transformers.AutoTokenizer.from_pretrained(output_model, trust_remote_code=True)

    print(f" Base: {len(tok_base)} tokens")
    print(f" Donor: {len(tok_donor)} tokens")
    print(f" Output: {len(tok_out)} tokens")

    # The surgered output is expected to adopt the donor's vocabulary wholesale.
    if len(tok_out) != len(tok_donor):
        print(" ❌ FAIL: Output vocab size does not match Donor vocab size!")
    else:
        print(" ✅ PASS: Output vocab size matches Donor.")

    print("\n[2] Loading Embedding Tensors (Lazy Load)...")
    emb_base = get_embed_tensor(base_model)
    emb_donor = get_embed_tensor(donor_model)
    emb_out = get_embed_tensor(output_model)

    print(f" Base Matrix: {emb_base.shape if emb_base is not None else 'Not found'}")
    print(f" Donor Matrix: {emb_donor.shape if emb_donor is not None else 'Not found'}")
    print(f" Output Matrix: {emb_out.shape if emb_out is not None else 'Not found'}")

    if emb_out is not None and emb_donor is not None:
        # Row count (dim 0) must cover every donor token id.
        if emb_out.shape[0] >= len(tok_donor):
            print(" ✅ PASS: Output embedding matrix size is sufficient for Donor vocab.")
        else:
            print(" ❌ FAIL: Output embedding matrix is smaller than Donor vocab!")

    vocab_base = tok_base.get_vocab()
    vocab_donor = tok_donor.get_vocab()

    # Tokens present in both vocabs vs. tokens the donor introduced.
    shared_tokens = set(vocab_base.keys()).intersection(set(vocab_donor.keys()))
    donor_only_tokens = set(vocab_donor.keys()) - set(vocab_base.keys())

    print("\n[3] Testing a Shared Token (Verifying exact transfer)...")
    if shared_tokens:
        # Pick a common word that is likely to exist in both
        test_shared = None
        for candidate in [" the", " hello", "The", "Hello", "Ġthe", "Ġhello", "the", "hello"]:
            if candidate in shared_tokens:
                test_shared = candidate
                break
        if not test_shared:
            # Fall back to an arbitrary shared token from the middle of the set.
            test_shared = list(shared_tokens)[len(shared_tokens)//2]

        id_base = vocab_base[test_shared]
        id_out = vocab_donor[test_shared] # output uses donor vocab

        print(f" Token: '{test_shared}'")
        print(f" ID in Base: {id_base} | ID in Output: {id_out}")

        if emb_base is not None and emb_out is not None:
            vec_base = emb_base[id_base].float()
            vec_out = emb_out[id_out].float()

            # A shared token's vector should be copied verbatim to its new id.
            cos_sim = torch.nn.functional.cosine_similarity(vec_base, vec_out, dim=0).item()
            print(f" Cosine similarity between Base and Output vectors: {cos_sim:.6f}")
            if cos_sim > 0.999:
                print(" ✅ PASS: Embeddings match perfectly. The vector was successfully moved to the new ID.")
            else:
                print(" ❌ FAIL: Embeddings for shared token do not match!")
    else:
        print(" ⚠️ No shared tokens found between vocabularies.")

    print("\n[4] Testing a New Token (Verifying OMP approximation)...")
    if donor_only_tokens:
        # Try to find a special token or a distinct word
        test_new = list(donor_only_tokens)[0]
        for t in donor_only_tokens:
            if "<" in t or "[" in t or "im_start" in t:
                test_new = t
                break

        id_out = vocab_donor[test_new]
        print(f" Token: '{test_new}' (Only exists in Donor)")
        print(f" ID in Output: {id_out}")

        if emb_out is not None:
            vec_out = emb_out[id_out].float()
            # A (near-)zero norm means the new token never got a real embedding.
            norm = vec_out.norm().item()
            print(f" Vector L2 Norm: {norm:.4f}")
            if norm > 0.01:
                print(" ✅ PASS: Vector is non-zero. OMP successfully approximated a new embedding.")
            else:
                print(" ⚠️ WARN: Vector is zero or very close to zero. It may have been treated as a junk token.")
    else:
        print(" ⚠️ No donor-only tokens found. Vocabularies are identical.")

    print("\n[5] Testing Tokenizer Encoding Behavior...")
    test_text = "Hello world! This is a test of the new tokenizer. <|im_start|>system\n12345<|im_end|>"
    enc_donor = tok_donor.encode(test_text)
    enc_out = tok_out.encode(test_text)

    if enc_donor == enc_out:
        print(" ✅ PASS: Output model encodes text exactly identically to the Donor model.")
    else:
        print(" ❌ FAIL: Output model encoding differs from Donor model!")
        print(f" Donor: {enc_donor[:10]}...")
        print(f" Output: {enc_out[:10]}...")

    print("\n" + "="*60)
    print("Audit Complete.")
    print("="*60)

if __name__ == '__main__':
    main()
tokensurgeon.py ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 Arcee AI
2
+ # SPDX-License-Identifier: LGPL-3.0-only
3
+
4
+ import enum
5
+ import logging
6
+ from typing import Dict, List, Optional, Tuple
7
+
8
+ import click
9
+ import torch
10
+ import torch.distributions.constraints
11
+ import tqdm
12
+ import transformers
13
+ from pydantic import BaseModel
14
+
15
+ from mergekit.architecture import (
16
+ ConfiguredModelArchitecture,
17
+ WeightInfo,
18
+ arch_info_for_config,
19
+ )
20
+ from mergekit.common import ModelReference, set_config_value
21
+ from mergekit.io.tasks import (
22
+ LoaderCache,
23
+ )
24
+ from mergekit.io.tensor_writer import TensorWriter
25
+ from mergekit.options import MergeOptions, PrettyPrintHelp, add_merge_options
26
+ from mergekit.tokenizer.normalization import (
27
+ NormalizedToken,
28
+ normalized_vocabulary,
29
+ token_prefixes,
30
+ )
31
+ from mergekit.tokensurgeon import (
32
+ SubwordMethod,
33
+ WeightingScheme,
34
+ batch_mp_rope,
35
+ batch_omp,
36
+ common_interp_approximate,
37
+ compute_token_basis,
38
+ landmark_pca_approximate,
39
+ subword_approximate,
40
+ well_trained_tokens,
41
+ )
42
+ from mergekit.tokensurgeon.common_interpolation import DistanceMetric
43
+
44
+ LOG = logging.getLogger(__name__)
45
+
46
+
47
class TokenAssignmentStats(BaseModel):
    """Counters recording how each donor-vocabulary token was assigned a row
    in the rebuilt embedding matrix."""

    exact_match: int = 0
    byte_match: int = 0
    prefix_match: int = 0
    to_approximate: int = 0

    def pretty_print(self):
        """Render a human-readable breakdown, omitting zero-valued categories."""
        labeled = [
            ("Exact matches", self.exact_match),
            ("Byte matches", self.byte_match),
            ("Prefix matches", self.prefix_match),
            ("Tokens to approximate", self.to_approximate),
        ]
        lines = ["Token Breakdown:"]
        for label, count in labeled:
            if count:
                lines.append(f"  {label}: {count}")
        lines.append(f"  Total: {sum(count for _, count in labeled)}")
        return "\n".join(lines)
67
+
68
+
69
class ApproximationMethod(enum.Enum):
    """Strategy used to synthesize embeddings for tokens the base model lacks.

    Dispatch on these values happens in ``compute_new_embeddings``.
    """

    COMMON_INTERPOLATION = "common_interpolation"  # weighted k-NN over shared-vocab embeddings
    SUBWORD = "subword"  # combine embeddings of the token's subword pieces (see subword_approximate)
    MEAN = "mean"  # mean of all original embeddings, repeated per new token
    ZERO = "zero"  # all-zero vectors
    RANDN = "randn"  # unit gaussian noise
    JOHN_HEWITT = "john_hewitt"  # sample from a gaussian fit to the original embeddings
    ORTHOGONAL_MATCHING_PURSUIT = "omp"  # sparse reconstruction via batch_omp
    LANDMARK_PCA = "landmark_pca"  # alignment via shared "landmark" tokens (landmark_pca_approximate)
    SPARSE_TOKEN_BASIS = "stb"  # least-squares coefficients over a precomputed token basis
    MATCHING_PURSUIT_ROPE = "mp_rope"  # matching pursuit variant aware of RoPE geometry (batch_mp_rope)
80
+
81
+
82
class TokenSurgeonOptions(BaseModel):
    """Settings controlling how the donor's vocabulary is grafted onto the base model."""

    model: ModelReference  # base model whose embeddings are re-indexed
    donor: ModelReference  # model supplying the target tokenizer/vocabulary
    out_path: str  # output directory for the resulting model
    method: ApproximationMethod = ApproximationMethod.COMMON_INTERPOLATION  # approximation for unseen tokens
    weight_scheme: WeightingScheme = WeightingScheme.DISTANCE_PROPORTIONAL  # weighting for common-vocab interpolation
    k: int = 64  # neighbours / basis elements used by the approximators
    cosine_similarity: bool = False  # cosine (vs. euclidean) distance for neighbour search
    subword_method: SubwordMethod = SubwordMethod.MEAN  # how subword pieces are combined (SUBWORD method)
    batch_size: Optional[int] = None  # tokens approximated per batch; None or <= 0 falls back to 512
    new_vocab_noise: Optional[float] = None  # stddev of gaussian noise added to newly approximated embeddings
    new_vocab_scale: Optional[float] = None  # scale factor applied to newly approximated embeddings
94
+
95
+
96
def get_arch_info(
    model: ModelReference, options: MergeOptions
) -> ConfiguredModelArchitecture:
    """Load a model's config and pair it with the matching architecture info."""
    config = model.config(trust_remote_code=options.trust_remote_code)
    return ConfiguredModelArchitecture(
        info=arch_info_for_config(config),
        config=config,
    )
102
+
103
+
104
def get_embedding_info(
    arch_info: ConfiguredModelArchitecture,
) -> Tuple[WeightInfo, WeightInfo]:
    """Return the (input embedding, output embedding) WeightInfo pair.

    Raises RuntimeError for multi-module models or when more than one
    embedding weight is found on either side.
    """
    if len(arch_info.info.modules) != 1:
        raise RuntimeError("Model has multiple modules - not supported by tokensurgeon")
    module_name = next(iter(arch_info.info.modules.keys()))
    module_def = arch_info.get_module(module_name)

    def _sole_embed(weights, kind: str) -> Optional[WeightInfo]:
        # Scan for the single weight flagged as an embedding; duplicates are an error.
        found = None
        for wi in weights:
            if wi.is_embed:
                if found is not None:
                    raise RuntimeError(f"Multiple {kind} embeddings found")
                found = wi
        return found

    embed = _sole_embed(module_def.pre_weights(), "input")
    lm_head = _sole_embed(module_def.post_weights(), "output")
    return embed, lm_head
127
+
128
+
129
def maybe_aliases(weight_info: WeightInfo, tied: bool) -> Tuple[str, ...]:
    """Collect a weight's alias names, optionally including tied-weight names."""
    names = list(weight_info.aliases or [])
    if tied:
        names.extend(weight_info.tied_names or [])
    return tuple(names)
134
+
135
+
136
def get_stuff(
    model: ModelReference,
    options: MergeOptions,
    arch_info: Optional[ConfiguredModelArchitecture] = None,
    get_tied: bool = False,
    device: str = "cpu",
) -> Tuple[Dict[NormalizedToken, int], Optional[torch.Tensor], Optional[torch.Tensor]]:
    """Load a model's normalized vocabulary plus its input-embedding and
    LM-head tensors.

    Returns (vocab, embed, lm_head); either tensor may be None when the
    corresponding weight is optional and absent.
    """
    if arch_info is None:
        arch_info = get_arch_info(model, options)
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model.model.path,
        revision=model.model.revision,
        trust_remote_code=options.trust_remote_code,
    )
    vocab = normalized_vocabulary(tokenizer)
    embed_wi, lm_head_wi = get_embedding_info(arch_info)
    loader = LoaderCache().get(model)

    def _fetch(wi: WeightInfo) -> Optional[torch.Tensor]:
        # Missing weights only raise when the architecture marks them required.
        return loader.get_tensor(
            wi.name,
            device=device,
            aliases=maybe_aliases(wi, get_tied),
            raise_on_missing=not wi.optional,
        )

    return vocab, _fetch(embed_wi), _fetch(lm_head_wi)
166
+
167
+
168
def match_byte_token(
    token: NormalizedToken, original_vocab: Dict[NormalizedToken, int]
) -> Optional[int]:
    """Cross-match single characters and ``<0xNN>`` byte tokens.

    A one-character token maps to its ``<0xNN>`` byte-token entry, and a
    ``<0xNN>`` token maps to its plain-character entry. Returns the matched
    vocabulary index, or None when no such counterpart exists.
    """
    if not isinstance(token, str):
        return None
    if len(token) == 1 and ord(token) < 256:
        # Character -> byte-token form, e.g. "A" -> "<0x41>".
        return original_vocab.get(f"<0x{ord(token):02X}>")
    if len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
        # Byte-token -> character form, e.g. "<0x41>" -> "A".
        try:
            byte_value = int(token[3:-1], 16)
        except ValueError:
            return None
        return original_vocab.get(chr(byte_value))
    return None
188
+
189
+
190
def match_prefix(
    token: NormalizedToken, original_vocab: Dict[NormalizedToken, int]
) -> Optional[int]:
    """Return the vocab index of the first prefix of ``token`` present in
    ``original_vocab`` (in ``token_prefixes`` order), or None."""
    hits = (
        original_vocab[prefix]
        for prefix in token_prefixes(token)
        if prefix in original_vocab
    )
    return next(hits, None)
197
+
198
+
199
def get_out_arch_info(
    model: ModelReference,
    donor: ModelReference,
    new_vocab_size: int,
    common_options: MergeOptions,
) -> ConfiguredModelArchitecture:
    """Build the output architecture: the base model's config with the new
    vocab size and the donor's special-token / padding settings applied."""
    cfg_donor = donor.config(trust_remote_code=common_options.trust_remote_code)
    cfg_out = model.config(trust_remote_code=common_options.trust_remote_code)
    arch_info_out = arch_info_for_config(cfg_out)
    vocab_key = arch_info_out.vocab_size_config_key or "vocab_size"
    set_config_value(cfg_out, vocab_key, new_vocab_size)
    # Carry donor tokenizer-related config values over to the output config.
    donor_keys = (
        "pad_token_id",
        "eos_token_id",
        "bos_token_id",
        "unk_token_id",
        "mask_token_id",
        "padding_side",
    )
    for key in donor_keys:
        if hasattr(cfg_donor, key):
            set_config_value(cfg_out, key, getattr(cfg_donor, key))
    return ConfiguredModelArchitecture(info=arch_info_out, config=cfg_out)
222
+
223
+
224
def john_hewitt_init(orig_embed: torch.Tensor, num_new_tokens: int) -> torch.Tensor:
    """Sample new token embeddings from a Gaussian fit to existing embeddings.

    Fits a multivariate normal (mean + empirical covariance) to the rows of
    ``orig_embed`` and draws ``num_new_tokens`` samples from it. When the
    covariance is not positive definite (e.g. fewer rows than dimensions),
    falls back to small random vectors scaled by 0.02.

    Args:
        orig_embed: ``(vocab_size, hidden_dim)`` embedding matrix to model.
        num_new_tokens: number of new embedding rows to sample.

    Returns:
        ``(num_new_tokens, hidden_dim)`` tensor in ``orig_embed``'s dtype.
    """
    orig_embed_f32 = orig_embed.to(torch.float32)
    mean = orig_embed_f32.mean(dim=0)
    centered = orig_embed_f32 - mean
    covariance = centered.T @ centered / orig_embed_f32.shape[0]
    is_pd = torch.distributions.constraints.positive_definite.check(covariance).all()
    if not is_pd:
        LOG.warning(
            "Covariance matrix is not positive definite - falling back to small randn"
        )
        # BUG FIX: num_new_tokens is an int; the previous code called
        # len(num_new_tokens) here, raising TypeError on this fallback path.
        return (
            torch.randn(
                num_new_tokens,
                orig_embed.shape[1],
                device=orig_embed.device,
                dtype=orig_embed.dtype,
            )
            * 0.02
        )
    dist = torch.distributions.multivariate_normal.MultivariateNormal(
        loc=mean,
        covariance_matrix=covariance,
    )
    new_embeds = dist.sample((num_new_tokens,))
    return new_embeds.to(orig_embed.dtype)
249
+
250
+
251
def compute_new_embeddings(
    orig_embed: torch.Tensor,
    donor_embed: torch.Tensor,
    orig_vocab: Dict[NormalizedToken, int],
    donor_vocab: Dict[NormalizedToken, int],
    target_tokens: List[NormalizedToken],
    is_lm_head: bool,
    token_basis: Optional[Tuple[torch.Tensor, torch.Tensor]],
    orig_tokenizer: transformers.PreTrainedTokenizerBase,
    options: TokenSurgeonOptions,
    shared_data: Optional[Dict] = None,
    compute_device: torch.device = torch.device("cpu"),
) -> torch.Tensor:
    """Approximate original-model embeddings for ``target_tokens``.

    Each target token must exist in ``donor_vocab``. Dispatches on
    ``options.method``; returns a ``(len(target_tokens), hidden_dim)`` tensor.
    ``shared_data`` may carry precomputed shared-vocabulary embeddings
    ("donor_shared_embeds" / "orig_shared_embeds") to avoid recomputing them
    per batch; ``token_basis`` is required only for the STB method.
    """
    assert all(t in donor_vocab for t in target_tokens)
    if options.method == ApproximationMethod.MEAN:
        # Every new token gets the mean of the original embeddings.
        mean = orig_embed.mean(dim=0).to(compute_device)
        return mean.unsqueeze(0).expand(len(target_tokens), -1)
    elif options.method == ApproximationMethod.ZERO:
        return torch.zeros(
            len(target_tokens),
            orig_embed.shape[1],
            device=compute_device,
            dtype=orig_embed.dtype,
        )
    elif options.method == ApproximationMethod.RANDN:
        return torch.randn(
            len(target_tokens),
            orig_embed.shape[1],
            device=compute_device,
            dtype=orig_embed.dtype,
        )
    elif options.method == ApproximationMethod.JOHN_HEWITT:
        return john_hewitt_init(orig_embed.to(compute_device), len(target_tokens))
    elif options.method in (
        ApproximationMethod.COMMON_INTERPOLATION,
        ApproximationMethod.ORTHOGONAL_MATCHING_PURSUIT,
        ApproximationMethod.LANDMARK_PCA,
        ApproximationMethod.MATCHING_PURSUIT_ROPE,
    ):
        # These methods express each target as a combination of tokens shared
        # by both vocabularies, then transfer the combination to orig space.
        if shared_data is not None:
            donor_shared_embeds = shared_data["donor_shared_embeds"].to(compute_device)
            orig_shared_embeds = shared_data["orig_shared_embeds"].to(compute_device)
        else:
            # Sort by donor index so row order is deterministic.
            shared_vocab = list(
                sorted(
                    set(orig_vocab.keys()) & set(donor_vocab.keys()),
                    key=lambda x: donor_vocab[x],
                )
            )
            donor_shared_embeds = donor_embed[
                torch.tensor([donor_vocab[t] for t in shared_vocab])
            ].to(compute_device)
            orig_shared_embeds = orig_embed[
                torch.tensor([orig_vocab[t] for t in shared_vocab])
            ].to(compute_device)
        res = None
        in_donor = None  # NOTE(review): only assigned by batch_mp_rope and never read afterwards
        targets = donor_embed[torch.tensor([donor_vocab[t] for t in target_tokens])].to(compute_device)
        if options.method == ApproximationMethod.LANDMARK_PCA:
            return landmark_pca_approximate(
                targets,
                donor_shared_embeds,
                orig_shared_embeds,
            )
        elif options.method == ApproximationMethod.COMMON_INTERPOLATION:
            indices, coeffs = common_interp_approximate(
                targets,
                donor_shared_embeds,
                k=options.k,
                metric=(
                    DistanceMetric.COSINE
                    if options.cosine_similarity
                    else DistanceMetric.EUCLIDEAN
                ),
                weight_scheme=options.weight_scheme,
            )
        elif options.method == ApproximationMethod.MATCHING_PURSUIT_ROPE:
            model_config = options.model.config(trust_remote_code=False)
            donor_config = options.donor.config(trust_remote_code=False)
            indices, coeffs, res, in_donor = batch_mp_rope(
                targets,
                donor_shared_embeds,
                orig_shared_embeds,
                k=options.k,
                num_heads_a=donor_config.num_attention_heads,
                num_heads_b=model_config.num_attention_heads,
                a_rope_base=donor_config.rope_theta,
                b_rope_base=model_config.rope_theta,
            )
        else:
            indices, coeffs = batch_omp(targets, donor_shared_embeds, options.k)
        if res is None:
            # Apply the donor-space coefficients to the corresponding rows of
            # the original-space shared embeddings: (B,1,k) @ (B,k,d) -> (B,d).
            res = (
                torch.bmm(
                    coeffs.unsqueeze(1), orig_shared_embeds[indices].to(torch.float)
                )
                .squeeze(1)
                .to(orig_embed.dtype)
            )
        return res
    elif options.method == ApproximationMethod.SUBWORD:
        # Tokenize each target with the original tokenizer and combine the
        # resulting subword embeddings.
        return subword_approximate(
            orig_embed.to(compute_device),
            target_tokens,
            is_lm_head,
            orig_tokenizer,
            options.subword_method,
        )
    elif options.method == ApproximationMethod.SPARSE_TOKEN_BASIS:
        assert token_basis is not None, "Token basis must be provided for STB"
        donor_basis, orig_basis = token_basis
        donor_basis = donor_basis.to(compute_device).to(torch.float32)
        orig_basis = orig_basis.to(compute_device).to(torch.float32)
        # Center the donor-space targets before solving for basis coefficients.
        target_donor_embeds = donor_embed[
            torch.tensor([donor_vocab[t] for t in target_tokens])
        ].to(compute_device).to(torch.float32) - donor_embed.mean(dim=0).to(compute_device)
        # Least-squares: coefficients expressing each target in the donor basis.
        coeffs = torch.linalg.lstsq(
            donor_basis.T,
            target_donor_embeds.T,
        ).solution.T
        if LOG.isEnabledFor(logging.DEBUG):
            # Diagnostics only: reconstruction quality in donor space.
            donor_rt = coeffs @ donor_basis
            err = (donor_rt - target_donor_embeds).norm(dim=1)
            err_rel = err / target_donor_embeds.norm(dim=1).clamp_min(1e-6)
            sim = torch.nn.functional.cosine_similarity(
                donor_rt, target_donor_embeds, dim=1
            )
            LOG.debug(f"Reconstruction error: {err.mean().item():.4f}")
            LOG.debug(f"Relative reconstruction error: {err_rel.mean().item():.4f}")
            LOG.debug(f"Cosine similarity: {sim.mean().item():.4f}")
        # Re-apply the same coefficients in original space and un-center.
        return coeffs @ orig_basis + orig_embed.mean(dim=0).to(compute_device)
    else:
        raise ValueError(f"Unknown approximation method: {options.method}")
392
+
393
+
394
def build_embedding_matrix(
    weight_info: WeightInfo,
    orig_embed: torch.Tensor,
    donor_embed: torch.Tensor,
    orig_vocab: Dict[NormalizedToken, int],
    donor_vocab: Dict[NormalizedToken, int],
    junk_tokens: List[int],
    allow_prefix: bool,
    allow_byte: bool,
    is_lm_head: bool,
    options: TokenSurgeonOptions,
    compute_device: torch.device,
) -> torch.Tensor:
    """Construct an embedding (or LM-head) matrix indexed by the donor vocab.

    Rows for tokens shared with the original vocabulary are copied directly;
    byte-/prefix-matched tokens (when enabled) reuse the matched original
    row; all remaining tokens are approximated in adaptively-sized batches
    via ``compute_new_embeddings``. Tokens listed in ``junk_tokens`` are
    zero-initialized at the end.

    Args:
        weight_info: weight being rebuilt (name used for logging only).
        orig_embed / donor_embed: source matrices for the two models.
        orig_vocab / donor_vocab: normalized token -> index maps.
        junk_tokens: donor indices to zero out (poorly-trained tokens).
        allow_prefix / allow_byte: enable heuristic token matching.
        is_lm_head: forwarded to the approximation method.
        options: surgeon settings (method, k, batch size, noise/scale).
        compute_device: device used for the approximation math.

    Returns:
        A ``(donor_vocab_size, hidden_dim)`` tensor on ``orig_embed``'s device.
    """
    LOG.info(f"Building new tensor for {weight_info.name}")
    stats = TokenAssignmentStats()
    out_vocab_size = max(len(donor_vocab), max(donor_vocab.values()) + 1)

    if options.method == ApproximationMethod.SPARSE_TOKEN_BASIS:
        token_basis = compute_token_basis(
            orig_embed,
            donor_embed,
            orig_vocab,
            donor_vocab,
            junk_tokens,
            options,
        )
    else:
        token_basis = None

    res = torch.zeros(
        out_vocab_size,
        orig_embed.shape[1],
        device=orig_embed.device,
        dtype=orig_embed.dtype,
    )
    # Assign every donor token either a copied row or queue it for approximation.
    new_tokens = []
    for token, donor_idx in donor_vocab.items():
        if token in orig_vocab:
            orig_idx = orig_vocab[token]
            res[donor_idx] = orig_embed[orig_idx]
            stats.exact_match += 1
        elif (
            allow_byte and (orig_idx := match_byte_token(token, orig_vocab)) is not None
        ):
            res[donor_idx] = orig_embed[orig_idx]
            stats.byte_match += 1
        elif allow_prefix and (orig_idx := match_prefix(token, orig_vocab)) is not None:
            res[donor_idx] = orig_embed[orig_idx]
            stats.prefix_match += 1
        else:
            new_tokens.append(token)
            stats.to_approximate += 1

    # FIX: the donor tokenizer was previously loaded here as well but never
    # used anywhere in this function — the wasted download/load was removed.
    # Only the original tokenizer is needed (for subword approximation).
    orig_tokenizer = transformers.AutoTokenizer.from_pretrained(
        options.model.model.path,
        revision=options.model.model.revision,
        trust_remote_code=True,
    )

    LOG.info(stats.pretty_print())
    if new_tokens:
        LOG.info(f"Approximating {len(new_tokens)} tokens")

        # Precompute shared embeds to avoid doing it in every batch
        shared_vocab = list(
            sorted(
                set(orig_vocab.keys()) & set(donor_vocab.keys()),
                key=lambda x: donor_vocab[x],
            )
        )
        donor_shared_embeds = donor_embed[
            torch.tensor([donor_vocab[t] for t in shared_vocab])
        ]
        orig_shared_embeds = orig_embed[
            torch.tensor([orig_vocab[t] for t in shared_vocab])
        ]
        shared_data = {
            "donor_shared_embeds": donor_shared_embeds,
            "orig_shared_embeds": orig_shared_embeds,
        }

        batch_size = options.batch_size
        if batch_size is None or batch_size <= 0:
            batch_size = 512

        # Adaptive batching: shrink the batch on CUDA OOM and retry.
        i = 0
        total_tokens = len(new_tokens)
        oom_count = 0

        pbar = tqdm.tqdm(total=total_tokens, desc="Approximating tokens")

        while i < total_tokens:
            end = min(i + batch_size, total_tokens)
            current_batch = new_tokens[i:end]

            try:
                new_embeds = compute_new_embeddings(
                    orig_embed,
                    donor_embed,
                    orig_vocab,
                    donor_vocab,
                    target_tokens=current_batch,
                    is_lm_head=is_lm_head,
                    token_basis=token_basis,
                    orig_tokenizer=orig_tokenizer,
                    options=options,
                    shared_data=shared_data,
                    compute_device=compute_device,
                )

                # Optional post-processing of freshly approximated rows.
                if options.new_vocab_noise:
                    new_embeds += torch.randn_like(new_embeds) * options.new_vocab_noise
                if options.new_vocab_scale:
                    new_embeds *= options.new_vocab_scale

                for ne_idx, token in enumerate(current_batch):
                    res[donor_vocab[token]] = new_embeds[ne_idx].to(res.device)

                # Success: advance and reset the consecutive-OOM counter.
                pbar.update(end - i)
                i = end
                oom_count = 0

                if compute_device.type == "cuda":
                    torch.cuda.empty_cache()

            except torch.OutOfMemoryError:
                oom_count += 1
                if compute_device.type == "cuda":
                    torch.cuda.empty_cache()
                import gc
                gc.collect()

                old_batch = batch_size
                batch_size = max(1, int(batch_size * 0.75))

                # Already at batch size 1 and still OOM: nothing left to shrink.
                if batch_size == old_batch and batch_size == 1:
                    LOG.error("OOM even with batch size 1. Cannot continue.")
                    raise

                LOG.warning(f"OOM error. Reducing batch size from {old_batch} to {batch_size} (attempt {oom_count})")

                if oom_count > 10:
                    LOG.error("Too many OOM errors, giving up.")
                    raise

        pbar.close()

    if junk_tokens:
        LOG.info(f"Zero-initializing {len(junk_tokens)} junk tokens")
        for token_id in junk_tokens:
            res[token_id] = torch.zeros(
                orig_embed.shape[1],
                device=orig_embed.device,
                dtype=orig_embed.dtype,
            )
    return res
558
+
559
+
560
class AllowMatch(enum.Enum):
    """Scope in which a heuristic (prefix/byte) token match may be applied."""

    LM_HEAD_ONLY = "lm_head"  # apply only when building the LM head matrix
    EMBED_ONLY = "embed"  # apply only when building the input embedding matrix
    YES = "yes"  # apply to both matrices
    NO = "no"  # never apply
565
+
566
+
567
@click.command("mergekit-tokensurgeon", cls=PrettyPrintHelp)
@click.argument("model", type=str)
@click.argument("donor", type=str)
@click.argument("out_path", type=str)
@click.option(
    "--k",
    "-k",
    type=int,
    default=64,
    help="Number of nearest neighbours to use for embedding interpolation",
    show_default=True,
)
@click.option(
    "--cosine-similarity/--no-cosine-similarity",
    "-c/-nc",
    is_flag=True,
    default=False,
    help="Use cosine similarity for nearest neighbour search",
    show_default=True,
)
@click.option(
    "--approximation-method",
    "-a",
    type=click.Choice([m.value for m in ApproximationMethod]),
    default=ApproximationMethod.ORTHOGONAL_MATCHING_PURSUIT.value,
    help="Method for approximating missing tokens",
    show_default=True,
)
@click.option(
    "--weight-scheme",
    "-w",
    type=click.Choice([w.value for w in WeightingScheme]),
    default=WeightingScheme.DISTANCE_PROPORTIONAL.value,
    help="Weighting scheme for common-vocabulary interpolation",
    show_default=True,
)
@click.option(
    "--subword-method",
    "-s",
    type=click.Choice([m.value for m in SubwordMethod]),
    default=SubwordMethod.MEAN.value,
    help="Method for approximating embeddings with subword tokens",
    show_default=True,
)
@click.option(
    "--batch-size",
    type=int,
    default=512,
    help="Number of tokens to process in each batch. -1 for no batching.",
    show_default=True,
)
@click.option(
    "--prefix-match",
    "-pm",
    type=click.Choice([m.value for m in AllowMatch]),
    default=AllowMatch.NO.value,
    help="Allow prefix match for tokens",
    show_default=True,
)
@click.option(
    "--byte-match",
    "-bm",
    type=click.Choice([m.value for m in AllowMatch]),
    default=AllowMatch.NO.value,
    help="Allow byte match for tokens",
    show_default=True,
)
@click.option(
    "--magikarp/--no-magikarp",
    is_flag=True,
    default=False,
    help="Filter out poorly trained tokens",
    show_default=True,
)
@click.option(
    "--new-vocab-noise",
    "-nvn",
    type=float,
    default=None,
    help="Add gaussian noise to new vocab embeddings",
    show_default=True,
)
@click.option(
    "--new-vocab-scale",
    "-nvs",
    type=float,
    default=None,
    help="Scale computed new vocab embeddings by this factor",
    show_default=True,
)
@add_merge_options
def main(
    model: str,
    donor: str,
    out_path: str,
    k: int,
    cosine_similarity: bool,
    approximation_method: str,
    weight_scheme: str,
    subword_method: str,
    batch_size: Optional[int],
    prefix_match: str,
    byte_match: str,
    magikarp: bool,
    new_vocab_noise: Optional[float],
    new_vocab_scale: Optional[float],
    merge_options: MergeOptions,
):
    """Replace a model's tokenizer/vocabulary with a donor model's, copying
    shared-token embeddings and approximating the rest."""
    merge_options.apply_global_options()
    logging.warning("This script is experimental and may produce unexpected results.")
    options = TokenSurgeonOptions(
        model=ModelReference.model_validate(model),
        donor=ModelReference.model_validate(donor),
        out_path=out_path,
        k=k,
        cosine_similarity=cosine_similarity,
        method=ApproximationMethod(approximation_method),
        weight_scheme=WeightingScheme(weight_scheme),
        subword_method=SubwordMethod(subword_method),
        batch_size=batch_size,
        new_vocab_noise=new_vocab_noise,
        new_vocab_scale=new_vocab_scale,
    )
    prefix_match = AllowMatch(prefix_match)
    byte_match = AllowMatch(byte_match)

    cache = LoaderCache()
    cache.setup(options=merge_options)

    # Heavy math runs on GPU when available; tensors are staged on CPU.
    compute_device = torch.device(merge_options.device if merge_options.device else "cuda" if torch.cuda.is_available() else "cpu")
    storage_device = "cpu"

    arch_info = get_arch_info(options.model, merge_options)
    embed_wi, lm_head_wi = get_embedding_info(arch_info)
    orig_vocab, orig_embed, orig_lm_head = get_stuff(
        options.model, merge_options, arch_info=arch_info, device=storage_device
    )
    donor_vocab, donor_embed, donor_lm_head = get_stuff(
        options.donor, merge_options, arch_info=None, get_tied=True, device=storage_device
    )

    if magikarp:
        # "Magikarp" filtering: restrict source rows to tokens that look
        # well-trained in BOTH models, and zero out donor tokens that are
        # poorly trained in both.
        LOG.debug("Finding well-trained tokens in original model")
        well_trained_orig_tokens = set(
            well_trained_tokens(
                orig_vocab,
                orig_embed,
                orig_lm_head,
            )
        )
        LOG.debug("Finding well-trained tokens in donor model")
        well_trained_donor_tokens = set(
            well_trained_tokens(
                donor_vocab,
                donor_embed,
                donor_lm_head,
            )
        )
        common_well_trained_tokens = (
            well_trained_orig_tokens & well_trained_donor_tokens
        )
        LOG.info(f"Found {len(common_well_trained_tokens)} common well-trained tokens")
        orig_vocab = {
            tok: idx
            for tok, idx in orig_vocab.items()
            if tok in common_well_trained_tokens
        }
        junk_tokens = [
            idx
            for tok, idx in donor_vocab.items()
            if (tok not in well_trained_donor_tokens)
            and (tok not in well_trained_orig_tokens)
        ]
    else:
        junk_tokens = []

    if orig_embed is not None:
        if donor_embed is None:
            raise RuntimeError(
                f"Missing tensor {embed_wi.name} in model {options.donor}"
            )
        # NOTE(review): the embed build gates on LM_HEAD_ONLY and the lm_head
        # build below gates on EMBED_ONLY — this looks swapped relative to the
        # enum names; confirm intent before relying on these flags.
        new_embed = build_embedding_matrix(
            embed_wi,
            orig_embed,
            donor_embed,
            orig_vocab=orig_vocab,
            donor_vocab=donor_vocab,
            junk_tokens=junk_tokens,
            allow_prefix=prefix_match in (AllowMatch.YES, AllowMatch.LM_HEAD_ONLY),
            allow_byte=byte_match in (AllowMatch.YES, AllowMatch.LM_HEAD_ONLY),
            is_lm_head=False,
            options=options,
            compute_device=compute_device,
        )
    else:
        if not embed_wi.optional:
            raise RuntimeError(
                f"Missing tensor {embed_wi.name} in model {options.model}"
            )
        new_embed = None

    if orig_lm_head is not None:
        if donor_lm_head is None:
            raise RuntimeError(
                f"Missing tensor {lm_head_wi.name} in model {options.donor}"
            )
        new_lm_head = build_embedding_matrix(
            lm_head_wi,
            orig_lm_head,
            donor_lm_head,
            orig_vocab=orig_vocab,
            donor_vocab=donor_vocab,
            junk_tokens=junk_tokens,
            allow_prefix=prefix_match in (AllowMatch.YES, AllowMatch.EMBED_ONLY),
            allow_byte=byte_match in (AllowMatch.YES, AllowMatch.EMBED_ONLY),
            is_lm_head=True,
            options=options,
            compute_device=compute_device,
        )
    else:
        if not lm_head_wi.optional:
            raise RuntimeError(
                f"Missing tensor {lm_head_wi.name} in model {options.model}"
            )
        new_lm_head = None

    # The output vocab size comes from whichever matrix was actually built.
    new_vocab_size = None
    if new_embed is not None:
        new_vocab_size = new_embed.shape[0]
    elif new_lm_head is not None:
        new_vocab_size = new_lm_head.shape[0]
    LOG.info(f"Saving new model to {out_path}")
    out_arch_info = get_out_arch_info(
        options.model, options.donor, new_vocab_size, merge_options
    )
    writer = TensorWriter(
        out_path,
        max_shard_size=merge_options.out_shard_size,
        safe_serialization=merge_options.safe_serialization,
        use_async=merge_options.async_write,
        max_write_threads=merge_options.write_threads,
    )
    # Write every weight: the two rebuilt matrices, everything else verbatim
    # from the base model.
    for weight_info in tqdm.tqdm(out_arch_info.all_weights(), desc="Saving weights"):
        if weight_info.name == embed_wi.name:
            tensor = new_embed
        elif lm_head_wi is not None and weight_info.name == lm_head_wi.name:
            tensor = new_lm_head
        else:
            tensor = cache.get(options.model).get_tensor(
                weight_info.name, aliases=weight_info.aliases, raise_on_missing=False
            )
        if tensor is None:
            if weight_info.optional:
                continue
            raise RuntimeError(
                f"Missing tensor {weight_info.name} in model {options.model}"
            )
        writer.save_tensor(weight_info.name, tensor, clone=merge_options.clone_tensors)

    # Force close lazy loader file handles so Windows allows deletion/renaming
    cache.flush_all()
    import gc
    gc.collect()

    # Delete original safetensors files to prevent FileExistsError during rename
    import os
    import re
    temp_pattern = re.compile(r"^.*-\d+\.safetensors$")
    for fname in os.listdir(out_path):
        if fname.endswith(".safetensors") and not temp_pattern.match(fname):
            try:
                os.remove(os.path.join(out_path, fname))
            except Exception as e:
                LOG.warning(f"Could not remove old file {fname}: {e}")
        elif fname == "model.safetensors.index.json":
            try:
                os.remove(os.path.join(out_path, fname))
            except Exception:
                pass

    writer.finalize()
    out_arch_info.config.save_pretrained(out_path)

    # The output model ships with the DONOR's tokenizer.
    tokenizer_out = transformers.AutoTokenizer.from_pretrained(
        options.donor.model.path,
        revision=options.donor.model.revision,
        trust_remote_code=merge_options.trust_remote_code,
    )
    tokenizer_out.save_pretrained(out_path)

    # Also copy generation_config.json if it exists in the donor
    donor_gen_config = os.path.join(options.donor.model.path, "generation_config.json")
    if os.path.exists(donor_gen_config):
        import shutil
        shutil.copy(donor_gen_config, os.path.join(out_path, "generation_config.json"))

    LOG.info("Done!")
864
+
865
+
866
# CLI entry point when executed directly; click supplies the arguments.
if __name__ == "__main__":
    main()