{"id":5314,"date":"2025-01-28T06:19:45","date_gmt":"2025-01-28T06:19:45","guid":{"rendered":"https:\/\/toshareproject.it\/artmakerblog\/?p=5314"},"modified":"2025-01-28T06:19:45","modified_gmt":"2025-01-28T06:19:45","slug":"the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm","status":"publish","type":"post","link":"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/","title":{"rendered":"The DeepSeek V3 model file is ~450 lines of code in MLX LM."},"content":{"rendered":"<p>*That&#8217;s not much code.  I could blog that much code.<\/p>\n<p>https:\/\/github.com\/ml-explore\/mlx-examples\/blob\/main\/llms\/mlx_lm\/models\/deepseek_v3.py<\/p>\n<p># Copyright \u00a9 2024 Apple Inc.<\/p>\n<p>import math<br \/>\nfrom dataclasses import dataclass<br \/>\nfrom typing import Any, Dict, Optional, Tuple<\/p>\n<p>import mlx.core as mx<br \/>\nimport mlx.nn as nn<\/p>\n<p>from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention<br \/>\nfrom .switch_layers import SwitchGLU<\/p>\n<p>@dataclass<br \/>\nclass ModelArgs(BaseModelArgs):<br \/>\n    model_type: str = &#8220;deepseek_v3&#8221;<br \/>\n    vocab_size: int = 102400<br \/>\n    hidden_size: int = 4096<br \/>\n    intermediate_size: int = 11008<br \/>\n    moe_intermediate_size: int = 1407<br \/>\n    num_hidden_layers: int = 30<br \/>\n    num_attention_heads: int = 32<br \/>\n    num_key_value_heads: int = 32<br \/>\n    n_shared_experts: Optional[int] = None<br \/>\n    n_routed_experts: Optional[int] = None<br \/>\n    routed_scaling_factor: float = 1.0<br \/>\n    kv_lora_rank: int = 512<br \/>\n    q_lora_rank: int = 1536<br \/>\n    qk_rope_head_dim: int = 64<br \/>\n    v_head_dim: int = 128<br \/>\n    qk_nope_head_dim: int = 128<br \/>\n    topk_method: str = &#8220;noaux_tc&#8221;<br \/>\n    scoring_func: str = &#8220;sigmoid&#8221;<br \/>\n    norm_topk_prob: bool = True<br \/>\n    n_group: Optional[int] = None<br \/>\n    topk_group: Optional[int] = None<br \/>\n    num_experts_per_tok: Optional[int] = None<br \/>\n    moe_layer_freq: int = 1<br \/>\n    first_k_dense_replace: int = 0<br \/>\n    max_position_embeddings: int = 2048<br \/>\n    rms_norm_eps: float = 1e-6<br \/>\n    rope_theta: float = 10000.0<br \/>\n    rope_scaling: Dict = None<br \/>\n    attention_bias: bool = False<\/p>\n<p>def yarn_find_correction_dim(<br \/>\n    num_rotations, dim, base=10000, max_position_embeddings=2048<br \/>\n):<br \/>\n    return (dim * math.log(max_position_embeddings \/ (num_rotations * 2 * math.pi))) \/ (<br \/>\n        2 * math.log(base)<br \/>\n    )<\/p>\n<p>def yarn_find_correction_range(<br \/>\n    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048<br \/>\n):<br \/>\n    low = math.floor(<br \/>\n        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)<br \/>\n    )<br \/>\n    high = math.ceil(<br \/>\n        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)<br \/>\n    )<br \/>\n    return max(low, 0), min(high, dim &#8211; 1)<\/p>\n<p>def yarn_get_mscale(scale=1, mscale=1):<br \/>\n    if scale <= 1:\n        return 1.0\n    return 0.1 * mscale * math.log(scale) + 1.0\n\n\ndef yarn_linear_ramp_mask(min_val, max_val, dim):\n    if min_val == max_val:\n        max_val += 0.001  # Prevent singularity\n\n    linear_func = (mx.arange(dim, dtype=mx.float32) - min_val) \/ (max_val - min_val)\n    return mx.clip(linear_func, 0, 1)\n\n\nclass DeepseekV3YarnRotaryEmbedding(nn.Module):\n    def __init__(\n        self,\n        dim,\n        max_position_embeddings=2048,\n        base=10000,\n        scaling_factor=1.0,\n        original_max_position_embeddings=4096,\n        beta_fast=32,\n        beta_slow=1,\n        mscale=1,\n        mscale_all_dim=0,\n    ):\n        super().__init__()\n        self.mscale = yarn_get_mscale(scaling_factor, mscale) \/ yarn_get_mscale(\n            scaling_factor, mscale_all_dim\n        )\n        freq_extra = base ** (mx.arange(0, dim, 2, dtype=mx.float32) \/ dim)\n        freq_inter = scaling_factor * base ** (\n            mx.arange(0, dim, 2, dtype=mx.float32) \/ dim\n        )\n        low, high = yarn_find_correction_range(\n            beta_fast,\n            beta_slow,\n            dim,\n            base,\n            original_max_position_embeddings,\n        )\n        freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim \/\/ 2)\n        self._freqs = (freq_inter * freq_extra) \/ (\n            freq_inter * freq_mask + freq_extra * (1 - freq_mask)\n        )\n\n    def __call__(self, x, offset=0):\n        if self.mscale != 1.0:\n            x = self.mscale * x\n        return mx.fast.rope(\n            x,\n            x.shape[-1],\n            traditional=True,\n            base=None,\n            scale=1.0,\n            offset=offset,\n            freqs=self._freqs,\n        )\n\n\nclass DeepseekV3Attention(nn.Module):\n    def __init__(self, config: ModelArgs):\n        super().__init__()\n        self.config = config\n        self.hidden_size = config.hidden_size\n        self.num_heads = config.num_attention_heads\n        self.max_position_embeddings = config.max_position_embeddings\n        self.rope_theta = config.rope_theta\n        self.q_lora_rank = config.q_lora_rank\n        self.qk_rope_head_dim = config.qk_rope_head_dim\n        self.kv_lora_rank = config.kv_lora_rank\n        self.v_head_dim = config.v_head_dim\n        self.qk_nope_head_dim = config.qk_nope_head_dim\n        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim\n\n        self.scale = self.q_head_dim**-0.5\n\n        if self.q_lora_rank is None:\n            self.q_proj = nn.Linear(\n                self.hidden_size, self.num_heads * self.q_head_dim, bias=False\n            )\n        else:\n            self.q_a_proj = nn.Linear(\n                self.hidden_size, self.q_lora_rank, bias=config.attention_bias\n            )\n            self.q_a_layernorm = nn.RMSNorm(self.q_lora_rank)\n            self.q_b_proj = nn.Linear(\n                self.q_lora_rank, self.num_heads * self.q_head_dim, bias=False\n            )\n\n        self.kv_a_proj_with_mqa = nn.Linear(\n            self.hidden_size,\n            self.kv_lora_rank + self.qk_rope_head_dim,\n            bias=config.attention_bias,\n        )\n        self.kv_a_layernorm = nn.RMSNorm(self.kv_lora_rank)\n        self.kv_b_proj = nn.Linear(\n            self.kv_lora_rank,\n            self.num_heads\n            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),\n            bias=False,\n        )\n\n        self.o_proj = nn.Linear(\n            self.num_heads * self.v_head_dim,\n            self.hidden_size,\n            bias=config.attention_bias,\n        )\n\n        mscale_all_dim = self.config.rope_scaling.get(\"mscale_all_dim\", 0)\n        scaling_factor = self.config.rope_scaling[\"factor\"]\n        if mscale_all_dim:\n            mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)\n            self.scale = self.scale * mscale * mscale\n\n        rope_kwargs = {\n            key: self.config.rope_scaling[key]\n            for key in [\n                \"original_max_position_embeddings\",\n                \"beta_fast\",\n                \"beta_slow\",\n                \"mscale\",\n                \"mscale_all_dim\",\n            ]\n            if key in self.config.rope_scaling\n        }\n        self.rope = DeepseekV3YarnRotaryEmbedding(\n            dim=self.qk_rope_head_dim,\n            max_position_embeddings=self.max_position_embeddings,\n            scaling_factor=scaling_factor,\n            base=self.rope_theta,\n            **rope_kwargs,\n        )\n\n    def __call__(\n        self,\n        x: mx.array,\n        mask: Optional[mx.array] = None,\n        cache: Optional[Any] = None,\n    ) -> mx.array:<br \/>\n        B, L, D = x.shape<\/p>\n<p>        if self.q_lora_rank is None:<br \/>\n            q = self.q_proj(x)<br \/>\n        else:<br \/>\n            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(x)))<\/p>\n<p>        q = q.reshape(B, L, self.num_heads, self.q_head_dim).transpose(0, 2, 1, 3)<br \/>\n        q_nope, q_pe = mx.split(q, [self.qk_nope_head_dim], axis=-1)<br \/>\n        compressed_kv = self.kv_a_proj_with_mqa(x)<br \/>\n        compressed_kv, k_pe = mx.split(compressed_kv, [self.kv_lora_rank], axis=-1)<br \/>\n        k_pe = k_pe.reshape(B, L, 1, self.qk_rope_head_dim).transpose(0, 2, 1, 3)<br \/>\n        kv = self.kv_b_proj(self.kv_a_layernorm(compressed_kv))<br \/>\n        kv = kv.reshape(B, L, self.num_heads, -1).transpose(0, 2, 1, 3)<\/p>\n<p>        k_nope, values = mx.split(kv, [self.qk_nope_head_dim], axis=-1)<\/p>\n<p>        if cache is not None:<br \/>\n            q_pe = self.rope(q_pe, cache.offset)<br \/>\n            k_pe = self.rope(k_pe, cache.offset)<br \/>\n            k_pe = mx.repeat(k_pe, self.num_heads, axis=1)<br \/>\n            keys, values = cache.update_and_fetch(<br \/>\n                mx.concatenate([k_nope, k_pe], axis=-1), values<br \/>\n            )<br \/>\n        else:<br \/>\n            q_pe = self.rope(q_pe)<br \/>\n            k_pe = self.rope(k_pe)<br \/>\n            k_pe = mx.repeat(k_pe, self.num_heads, axis=1)<br \/>\n            keys = mx.concatenate([k_nope, k_pe], axis=-1)<\/p>\n<p>        queries = mx.concatenate([q_nope, q_pe], axis=-1)<\/p>\n<p>        output = scaled_dot_product_attention(<br \/>\n            queries, keys, values, cache=cache, scale=self.scale, mask=mask<br \/>\n        )<br \/>\n        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)<br \/>\n        return self.o_proj(output)<\/p>\n<p>class DeepseekV3MLP(nn.Module):<br \/>\n    def __init__(<br \/>\n        self, config: ModelArgs, hidden_size: int = None, intermediate_size: int = None<br \/>\n    ):<br \/>\n        super().__init__()<br \/>\n        self.config = config<br \/>\n        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size<br \/>\n        self.intermediate_size = (<br \/>\n            config.intermediate_size if intermediate_size is None else intermediate_size<br \/>\n        )<\/p>\n<p>        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)<br \/>\n        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)<br \/>\n        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)<\/p>\n<p>    def __call__(self, x):<br \/>\n        down_proj = self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))<br \/>\n        return down_proj<\/p>\n<p>class MoEGate(nn.Module):<br \/>\n    def __init__(self, config: ModelArgs):<br \/>\n        super().__init__()<br \/>\n        self.config = config<br \/>\n        self.top_k = config.num_experts_per_tok<br \/>\n        self.norm_topk_prob = config.norm_topk_prob<br \/>\n        self.n_routed_experts = config.n_routed_experts<br \/>\n        self.routed_scaling_factor = config.routed_scaling_factor<br \/>\n        self.topk_method = config.topk_method<br \/>\n        self.n_group = config.n_group<br \/>\n        self.topk_group = config.topk_group<br \/>\n        self.weight = mx.zeros((self.n_routed_experts, config.hidden_size))<br \/>\n        self.e_score_correction_bias = mx.zeros((self.n_routed_experts,))<\/p>\n<p>    def __call__(self, x):<br \/>\n        gates = x @ self.weight.T<\/p>\n<p>        scores = mx.sigmoid(gates.astype(mx.float32))<\/p>\n<p>        assert self.topk_method == &#8220;noaux_tc&#8221;, &#8220;Unsupported topk method.&#8221;<br \/>\n        bsz, seq_len = x.shape[:2]<br \/>\n        scores = scores + self.e_score_correction_bias<br \/>\n        scores = scores.reshape(bsz, seq_len, self.n_group, -1)<br \/>\n        group_scores = mx.topk(scores, 2, axis=-1).sum(axis=-1)<br \/>\n        k = self.n_group &#8211; self.topk_group<br \/>\n        group_idx = mx.argpartition(group_scores, kth=k &#8211; 1, axis=-1)[&#8230;, :k]<br \/>\n        batch_idx = mx.expand_dims(mx.arange(bsz), (1, 2))<br \/>\n        seq_idx = mx.expand_dims(mx.arange(seq_len), (0, 2))<br \/>\n        scores[batch_idx, seq_idx, group_idx] = 0.0<br \/>\n        scores = scores.reshape(bsz, seq_len, -1)<\/p>\n<p>        k = self.top_k<br \/>\n        inds = mx.argpartition(-scores, kth=k &#8211; 1, axis=-1)[&#8230;, :k]<br \/>\n        scores = mx.take_along_axis(scores, inds, axis=-1)<br \/>\n        if self.top_k > 1 and self.norm_topk_prob:<br \/>\n            denominator = scores.sum(axis=-1, keepdims=True) + 1e-20<br \/>\n            scores = scores \/ denominator<br \/>\n        scores = scores * self.routed_scaling_factor<\/p>\n<p>        return inds, scores<\/p>\n<p>class DeepseekV3MoE(nn.Module):<br \/>\n    def __init__(self, config: ModelArgs):<br \/>\n        super().__init__()<br \/>\n        self.config = config<br \/>\n        self.num_experts_per_tok = config.num_experts_per_tok<br \/>\n        self.switch_mlp = SwitchGLU(<br \/>\n            config.hidden_size, config.moe_intermediate_size, config.n_routed_experts<br \/>\n        )<\/p>\n<p>        self.gate = MoEGate(config)<br \/>\n        if config.n_shared_experts is not None:<br \/>\n            intermediate_size = config.moe_intermediate_size * config.n_shared_experts<br \/>\n            self.shared_experts = DeepseekV3MLP(<br \/>\n                config=config, intermediate_size=intermediate_size<br \/>\n            )<\/p>\n<p>    def __call__(self, x):<br \/>\n        inds, scores = self.gate(x)<br \/>\n        y = self.switch_mlp(x, inds)<br \/>\n        y = (y * scores[&#8230;, None]).sum(axis=-2).astype(y.dtype)<br \/>\n        if self.config.n_shared_experts is not None:<br \/>\n            y = y + self.shared_experts(x)<\/p>\n<p>        return y<\/p>\n<p>class DeepseekV3DecoderLayer(nn.Module):<br \/>\n    def __init__(self, config: ModelArgs, layer_idx: int):<br \/>\n        super().__init__()<br \/>\n        self.self_attn = DeepseekV3Attention(config)<br \/>\n        self.mlp = (<br \/>\n            DeepseekV3MoE(config)<br \/>\n            if (<br \/>\n                config.n_routed_experts is not None<br \/>\n                and layer_idx >= config.first_k_dense_replace<br \/>\n                and layer_idx % config.moe_layer_freq == 0<br \/>\n            )<br \/>\n            else DeepseekV3MLP(config)<br \/>\n        )<br \/>\n        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)<br \/>\n        self.post_attention_layernorm = nn.RMSNorm(<br \/>\n            config.hidden_size, eps=config.rms_norm_eps<br \/>\n        )<\/p>\n<p>    def __call__(<br \/>\n        self,<br \/>\n        x: mx.array,<br \/>\n        mask: Optional[mx.array] = None,<br \/>\n        cache: Optional[Any] = None,<br \/>\n    ) -> mx.array:<br \/>\n        r = self.self_attn(self.input_layernorm(x), mask, cache)<br \/>\n        h = x + r<br \/>\n        r = self.mlp(self.post_attention_layernorm(h))<br \/>\n        out = h + r<br \/>\n        # Protect against overflow for fp16<br \/>\n        if out.dtype == mx.float16:<br \/>\n            out = mx.clip(out, a_min=None, a_max=mx.finfo(mx.float16).max &#8211; 1000)<br \/>\n        return out<\/p>\n<p>class DeepseekV3Model(nn.Module):<br \/>\n    def __init__(self, config: ModelArgs):<br \/>\n        super().__init__()<br \/>\n        self.vocab_size = config.vocab_size<br \/>\n        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)<br \/>\n        self.layers = [<br \/>\n            DeepseekV3DecoderLayer(config, idx)<br \/>\n            for idx in range(config.num_hidden_layers)<br \/>\n        ]<br \/>\n        self.norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)<br \/>\n        self.pipeline_rank = 0<br \/>\n        self.pipeline_size = 1<\/p>\n<p>    def pipeline(self, group):<br \/>\n        # Split layers in reverse so rank=0 gets the last layers and<br \/>\n        # rank=pipeline_size-1 gets the first<br \/>\n        self.pipeline_rank = group.rank()<br \/>\n        self.pipeline_size = group.size()<br \/>\n        layers_per_rank = (<br \/>\n            len(self.layers) + self.pipeline_size &#8211; 1<br \/>\n        ) \/\/ self.pipeline_size<br \/>\n        start = (self.pipeline_size &#8211; self.pipeline_rank &#8211; 1) * layers_per_rank<br \/>\n        self.layers = self.layers[start : start + layers_per_rank]<\/p>\n<p>    def __call__(<br \/>\n        self,<br \/>\n        x: mx.array,<br \/>\n        cache: Optional[Any] = None,<br \/>\n        mask: Optional[mx.array] = None,<br \/>\n    ) -> mx.array:<br \/>\n        h = self.embed_tokens(x)<\/p>\n<p>        pipeline_rank = self.pipeline_rank<br \/>\n        pipeline_size = self.pipeline_size<br \/>\n        # Hack to avoid time-outs during prompt-processing<br \/>\n        dist_stream = mx.cpu if h.shape[1] > 1 else mx.gpu<br \/>\n        if mask is None:<br \/>\n            mask = create_attention_mask(h, cache)<\/p>\n<p>        if cache is None:<br \/>\n            cache = [None] * len(self.layers)<\/p>\n<p>        # Receive from the previous process in the pipeline<\/p>\n<p>        if pipeline_rank < pipeline_size - 1:\n            h = mx.distributed.recv_like(h, (pipeline_rank + 1), stream=dist_stream)\n\n        for layer, c in zip(self.layers, cache):\n            h = layer(h, mask, c)\n\n        # Send to the next process in the pipeline\n        if pipeline_rank != 0:\n            h = mx.distributed.send(\n                h, (pipeline_rank - 1) % pipeline_size, stream=dist_stream\n            )\n\n        # Broadcast h while keeping it in the graph\n        h = mx.distributed.all_gather(h, stream=dist_stream)[: h.shape[0]]\n\n        return self.norm(h)\n\n\nclass Model(nn.Module):\n    def __init__(self, config: ModelArgs):\n        super().__init__()\n        self.args = config\n        self.model_type = config.model_type\n        self.model = DeepseekV3Model(config)\n        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)\n\n    def __call__(\n        self,\n        inputs: mx.array,\n        cache: Optional[Any] = None,\n        mask: Optional[mx.array] = None,\n    ):\n        out = self.model(inputs, cache, mask)\n        return self.lm_head(out)\n\n    def sanitize(self, weights):\n        for l in range(self.args.num_hidden_layers):\n            prefix = f\"model.layers.{l}\"\n            for n, m in [(\"w1\", \"gate_proj\"), (\"w2\", \"down_proj\"), (\"w3\", \"up_proj\")]:\n                for k in [\"weight\", \"scales\", \"biases\"]:\n                    if f\"{prefix}.mlp.experts.0.{m}.{k}\" in weights:\n                        to_join = [\n                            weights.pop(f\"{prefix}.mlp.experts.{e}.{m}.{k}\")\n                            for e in range(self.args.n_routed_experts)\n                        ]\n                        weights[f\"{prefix}.mlp.switch_mlp.{m}.{k}\"] = mx.stack(to_join)\n\n        # Remove multi-token prediction layer\n        return {k: v for k, v in weights.items() if not k.startswith(\"model.layers.61\")}\n\n    @property\n    def layers(self):\n        return self.model.layers\n<\/p>\n","protected":false},"excerpt":{"rendered":"<p>*That&#8217;s not much code. I could blog that much code. https:\/\/github.com\/ml-explore\/mlx-examples\/blob\/main\/llms\/mlx_lm\/models\/deepseek_v3.py # Copyright \u00a9 2024 Apple Inc. import math from dataclasses import dataclass from typing import Any, Dict, Optional, Tuple import mlx.core as mx import mlx.nn as nn from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention from .switch_layers import SwitchGLU @dataclass class ModelArgs(BaseModelArgs): model_type: str = &#8220;deepseek_v3&#8221; [&hellip;]<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[],"class_list":["post-5314","post","type-post","status-publish","format-standard","hentry","category-uncategorised"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v17.0 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog<\/title>\n<meta name=\"description\" content=\"The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog\" \/>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/\" \/>\n<meta property=\"og:locale\" content=\"en_GB\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog\" \/>\n<meta property=\"og:description\" content=\"The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog\" \/>\n<meta property=\"og:url\" content=\"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/\" \/>\n<meta property=\"og:site_name\" content=\"Artmaker Blog\" \/>\n<meta property=\"article:published_time\" content=\"2025-01-28T06:19:45+00:00\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"Written by\" \/>\n\t<meta name=\"twitter:data1\" content=\"Bruce Sterling\" \/>\n\t<meta name=\"twitter:label2\" content=\"Estimated reading time\" \/>\n\t<meta name=\"twitter:data2\" content=\"8 minutes\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebSite\",\"@id\":\"https:\/\/toshareproject.it\/artmakerblog\/#website\",\"url\":\"https:\/\/toshareproject.it\/artmakerblog\/\",\"name\":\"Artmaker Blog\",\"description\":\"on Toshareproject.it - curated by Bruce Sterling\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/toshareproject.it\/artmakerblog\/?s={search_term_string}\"},\"query-input\":\"required name=search_term_string\"}],\"inLanguage\":\"en-GB\"},{\"@type\":\"WebPage\",\"@id\":\"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/#webpage\",\"url\":\"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/\",\"name\":\"The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog\",\"isPartOf\":{\"@id\":\"https:\/\/toshareproject.it\/artmakerblog\/#website\"},\"datePublished\":\"2025-01-28T06:19:45+00:00\",\"dateModified\":\"2025-01-28T06:19:45+00:00\",\"author\":{\"@id\":\"https:\/\/toshareproject.it\/artmakerblog\/#\/schema\/person\/6f20726ed2761431f3e0ff4e096c3085\"},\"description\":\"The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog\",\"breadcrumb\":{\"@id\":\"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/#breadcrumb\"},\"inLanguage\":\"en-GB\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https:\/\/toshareproject.it\/artmakerblog\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"The DeepSeek V3 model file is ~450 lines of code in MLX LM.\"}]},{\"@type\":\"Person\",\"@id\":\"https:\/\/toshareproject.it\/artmakerblog\/#\/schema\/person\/6f20726ed2761431f3e0ff4e096c3085\",\"name\":\"Bruce Sterling\",\"image\":{\"@type\":\"ImageObject\",\"@id\":\"https:\/\/toshareproject.it\/artmakerblog\/#personlogo\",\"inLanguage\":\"en-GB\",\"url\":\"https:\/\/secure.gravatar.com\/avatar\/c390e8ed4db57a34278dcf667f928a643cf769a865c8a8632dcd310412bb9a99?s=96&d=mm&r=g\",\"contentUrl\":\"https:\/\/secure.gravatar.com\/avatar\/c390e8ed4db57a34278dcf667f928a643cf769a865c8a8632dcd310412bb9a99?s=96&d=mm&r=g\",\"caption\":\"Bruce Sterling\"},\"description\":\"Art director at Share Festival, author and journalist\",\"sameAs\":[\"http:\/\/toshareproject.it\/tomorrowart\"],\"url\":\"https:\/\/toshareproject.it\/artmakerblog\/author\/brucesterling\/\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog","description":"The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/","og_locale":"en_GB","og_type":"article","og_title":"The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog","og_description":"The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog","og_url":"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/","og_site_name":"Artmaker Blog","article_published_time":"2025-01-28T06:19:45+00:00","twitter_card":"summary_large_image","twitter_misc":{"Written by":"Bruce Sterling","Estimated reading time":"8 minutes"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebSite","@id":"https:\/\/toshareproject.it\/artmakerblog\/#website","url":"https:\/\/toshareproject.it\/artmakerblog\/","name":"Artmaker Blog","description":"on Toshareproject.it - curated by Bruce Sterling","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/toshareproject.it\/artmakerblog\/?s={search_term_string}"},"query-input":"required name=search_term_string"}],"inLanguage":"en-GB"},{"@type":"WebPage","@id":"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/#webpage","url":"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/","name":"The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog","isPartOf":{"@id":"https:\/\/toshareproject.it\/artmakerblog\/#website"},"datePublished":"2025-01-28T06:19:45+00:00","dateModified":"2025-01-28T06:19:45+00:00","author":{"@id":"https:\/\/toshareproject.it\/artmakerblog\/#\/schema\/person\/6f20726ed2761431f3e0ff4e096c3085"},"description":"The DeepSeek V3 model file is ~450 lines of code in MLX LM. | Artmaker Blog","breadcrumb":{"@id":"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/#breadcrumb"},"inLanguage":"en-GB","potentialAction":[{"@type":"ReadAction","target":["https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/toshareproject.it\/artmakerblog\/the-deepseek-v3-model-file-is-450-lines-of-code-in-mlx-lm\/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https:\/\/toshareproject.it\/artmakerblog\/"},{"@type":"ListItem","position":2,"name":"The DeepSeek V3 model file is ~450 lines of code in MLX LM."}]},{"@type":"Person","@id":"https:\/\/toshareproject.it\/artmakerblog\/#\/schema\/person\/6f20726ed2761431f3e0ff4e096c3085","name":"Bruce Sterling","image":{"@type":"ImageObject","@id":"https:\/\/toshareproject.it\/artmakerblog\/#personlogo","inLanguage":"en-GB","url":"https:\/\/secure.gravatar.com\/avatar\/c390e8ed4db57a34278dcf667f928a643cf769a865c8a8632dcd310412bb9a99?s=96&d=mm&r=g","contentUrl":"https:\/\/secure.gravatar.com\/avatar\/c390e8ed4db57a34278dcf667f928a643cf769a865c8a8632dcd310412bb9a99?s=96&d=mm&r=g","caption":"Bruce Sterling"},"description":"Art director at Share Festival, author and journalist","sameAs":["http:\/\/toshareproject.it\/tomorrowart"],"url":"https:\/\/toshareproject.it\/artmakerblog\/author\/brucesterling\/"}]}},"_links":{"self":[{"href":"https:\/\/toshareproject.it\/artmakerblog\/wp-json\/wp\/v2\/posts\/5314","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/toshareproject.it\/artmakerblog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/toshareproject.it\/artmakerblog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/toshareproject.it\/artmakerblog\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/toshareproject.it\/artmakerblog\/wp-json\/wp\/v2\/comments?post=5314"}],"version-history":[{"count":1,"href":"https:\/\/toshareproject.it\/artmakerblog\/wp-json\/wp\/v2\/posts\/5314\/revisions"}],"predecessor-version":[{"id":5315,"href":"https:\/\/toshareproject.it\/artmakerblog\/wp-json\/wp\/v2\/posts\/5314\/revisions\/5315"}],"wp:attachment":[{"href":"https:\/\/toshareproject.it\/artmakerblog\/wp-json\/wp\/v2\/media?parent=5314"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/toshareproject.it\/artmakerblog\/wp-json\/wp\/v2\/categories?post=5314"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/toshareproject.it\/artmakerblog\/wp-json\/wp\/v2\/tags?post=5314"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}