ONNX model for GLM-OCR

Try it with ningpp/flux.

Flux is a Java-based OCR library.

Attention

If you downloaded the model before 2026-03-07, please download it again: the current version of the model has better inference performance.

ONNX Inference

"""
End-to-end ONNX inference for GLM-OCR model.

This script performs complete inference using exported ONNX models:
1. Vision encoder (processes images)
2. Embedding layer (converts token IDs to embeddings)
3. Prefill model (processes prompt)
4. Decode model (generates tokens autoregressively)

Usage:
    python onnx_inference_e2e.py --image <path> --max-tokens 100
"""

import os
import sys
import time
import argparse
from typing import List
from PIL import Image
import numpy as np
import onnxruntime as ort
from transformers import AutoProcessor, AutoConfig


class GLMOcrOnnxInference:
    """End-to-end ONNX inference for GLM-OCR."""

    def __init__(self, onnx_dir: str, device: str = "cpu"):
        """
        Initialize ONNX inference sessions.

        Args:
            onnx_dir: Directory containing exported ONNX models
            device: "cpu" or "cuda"
        """
        self.onnx_dir = onnx_dir
        self.device = device
        self.providers = ["CUDAExecutionProvider"] if device == "cuda" else ["CPUExecutionProvider"]

        # Load processor for tokenization
        print(f"Loading processor from {onnx_dir}...")
        self.processor = AutoProcessor.from_pretrained(onnx_dir, trust_remote_code=True)

        # Model config
        self.config = self._load_config()

        # Create ONNX sessions
        self.sessions = self._create_sessions()

    def _load_config(self):
        """Load model configuration without loading the entire model."""
        # Load config directly instead of the entire model
        config = AutoConfig.from_pretrained(self.onnx_dir, trust_remote_code=True)
        return config

    def _create_sessions(self) -> dict:
        """Create ONNX Runtime sessions for all models."""
        print("Creating ONNX Runtime sessions...")

        opts = ort.SessionOptions()
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        
        if self.device == "cuda":
            # CUDA-specific optimizations
            opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
            opts.enable_mem_pattern = True
            opts.enable_mem_reuse = True
        else:
            # CPU optimizations
            opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
            import multiprocessing
            num_cores = multiprocessing.cpu_count()
            opts.intra_op_num_threads = num_cores
            opts.inter_op_num_threads = 1

        sessions = {}

        # Get available providers and set up CUDA options
        if self.device == "cuda":
            available_providers = ort.get_available_providers()
            providers = []
            
            # Try TensorRT first if available (best performance)
            if "TensorrtExecutionProvider" in available_providers:
                print("  TensorRT is available but disabled temporarily due to shape inference requirements")
                # Commented out until we run shape inference on the model
                # providers.append(("TensorrtExecutionProvider", {
                #     "trt_engine_cache_enable": True,
                #     "trt_engine_cache_path": "./trt_cache",
                #     "trt_fp16_enable": True,
                # }))
                # print("  Using TensorRT Execution Provider")
            
            # Always add CUDAExecutionProvider
            providers.append(("CUDAExecutionProvider", {
                "device_id": 0,
                "arena_extend_strategy": "kNextPowerOfTwo",
                "cudnn_conv_algo_search": "EXHAUSTIVE",
                "do_copy_in_default_stream": True,
            }))
            
            # Fallback to CPU
            providers.append("CPUExecutionProvider")
        else:
            providers = self.providers

        # Vision encoder
        vision_path = os.path.join(self.onnx_dir, "vision_encoder_fused.onnx")
        if os.path.exists(vision_path):
            sessions["vision"] = ort.InferenceSession(
                vision_path, opts, providers=providers
            )
            print(f"  ✓ Vision encoder loaded")

        # Embedding layer
        embedding_path = os.path.join(self.onnx_dir, "embedding.onnx")
        if os.path.exists(embedding_path):
            sessions["embedding"] = ort.InferenceSession(
                embedding_path, opts, providers=providers
            )
            print(f"  ✓ Embedding layer loaded")

        # Prefill model
        prefill_path = os.path.join(self.onnx_dir, "llm_prefill.onnx")
        if os.path.exists(prefill_path):
            sessions["prefill"] = ort.InferenceSession(
                prefill_path, opts, providers=providers
            )
            print(f"  ✓ Prefill model loaded")

        # Decode model
        decode_path = os.path.join(self.onnx_dir, "llm_decode.onnx")
        if os.path.exists(decode_path):
            sessions["decode"] = ort.InferenceSession(
                decode_path, opts, providers=providers
            )
            print(f"  ✓ Decode model loaded")

        return sessions

    def encode_image(self, image_path: str) -> np.ndarray:
        """
        Encode image using vision encoder.

        Args:
            image_path: Path to image file

        Returns:
            Image features as numpy array
        """
        if "vision" not in self.sessions:
            raise RuntimeError("Vision encoder not available")

        # Load and preprocess image
        image = Image.open(image_path).convert("RGB")

        # Use full processor to get all necessary inputs (pixel_values, grid_thw)
        messages = [{'role': 'user', 'content': [{'type': 'image'}, {'type': 'text', 'text': 'test'}]}]
        text = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(text=text, images=[image], return_tensors='pt')
        
        pixel_values = inputs.pixel_values
        grid_thw = inputs.image_grid_thw
        
        # Compute pos_ids and max_grid_size
        pos_ids, max_grid_size = self._compute_pos_ids(grid_thw)
        
        # Convert to numpy arrays
        pixel_values_np = pixel_values.numpy()
        pos_ids_np = pos_ids.numpy()
        max_grid_size_np = np.array(max_grid_size, dtype=np.int64)

        # Run vision encoder
        outputs = self.sessions["vision"].run(None, {
            "pixel_values": pixel_values_np,
            "pos_ids": pos_ids_np,
            "max_grid_size": max_grid_size_np
        })

        return outputs[0]  # image_features
    
    def _compute_pos_ids(self, grid_thw, spatial_merge_size: int = 2):
        """
        Pre-compute position IDs for rotary embeddings.
        
        Args:
            grid_thw: [batch_size, 3] - (temporal, height_patches, width_patches) for each image
            spatial_merge_size: The spatial merge factor (default 2)
            
        Returns:
            pos_ids: [total_patches, 2] - position indices for all patches
            max_grid_size: int - maximum grid dimension
        """
        import torch
        pos_ids_list = []
        for t, h, w in grid_thw:
            t, h, w = int(t), int(h), int(w)
            
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            hpos_ids = hpos_ids.reshape(
                h // spatial_merge_size,
                spatial_merge_size,
                w // spatial_merge_size,
                spatial_merge_size,
            )
            hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten()
            
            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            wpos_ids = wpos_ids.reshape(
                h // spatial_merge_size,
                spatial_merge_size,
                w // spatial_merge_size,
                spatial_merge_size,
            )
            wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten()
            
            pos_ids_list.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
        
        pos_ids = torch.cat(pos_ids_list, dim=0)
        max_grid_size = int(grid_thw[:, 1:].max())
        
        return pos_ids, max_grid_size

    def _get_rope_index(self, input_ids_list, image_grid_thw, attention_mask_list=None):
        """
        Calculate position_ids for M-RoPE (same logic as PyTorch's get_rope_index).
        
        Args:
            input_ids_list: List of input token IDs
            image_grid_thw: Tensor of [t, h, w] for image grid
            attention_mask_list: List of attention mask values
        
        Returns:
            position_ids: numpy array of shape [3, seq_len]
            rope_deltas: int, the delta for decode position calculation
        """
        spatial_merge_size = self.config.vision_config.spatial_merge_size
        image_token_id = self.config.image_token_id
        
        # Get image grid dimensions
        t, h, w = image_grid_thw[0][0].item(), image_grid_thw[0][1].item(), image_grid_thw[0][2].item()
        llm_grid_t = t
        llm_grid_h = h // spatial_merge_size
        llm_grid_w = w // spatial_merge_size
        
        # Find image token positions
        boi_token_id = 59256  # begin-of-image token
        eoi_token_id = 59257  # end-of-image token
        
        # Build position_ids
        seq_len = len(input_ids_list)
        position_ids = np.zeros((3, seq_len), dtype=np.int64)
        
        # Find BOI and EOI positions
        boi_pos = None
        eoi_pos = None
        for i, tid in enumerate(input_ids_list):
            if tid == boi_token_id:
                boi_pos = i
            elif tid == eoi_token_id:
                eoi_pos = i
        
        if boi_pos is None or eoi_pos is None:
            # No image tokens, use simple position_ids
            for i in range(seq_len):
                position_ids[0, i] = i
                position_ids[1, i] = i
                position_ids[2, i] = i
            return position_ids, 0
        
        # Text tokens before image
        for i in range(boi_pos):
            position_ids[0, i] = i
            position_ids[1, i] = i
            position_ids[2, i] = i
        
        # BOI token
        st_idx = boi_pos
        position_ids[0, boi_pos] = st_idx
        position_ids[1, boi_pos] = st_idx
        position_ids[2, boi_pos] = st_idx
        
        # Image tokens - use 3D position encoding
        # t_index, h_index, w_index for each image token
        img_start = boi_pos + 1
        img_end = eoi_pos
        
        for idx, pos in enumerate(range(img_start, img_end)):
            t_idx = idx // (llm_grid_h * llm_grid_w)
            hw_idx = idx % (llm_grid_h * llm_grid_w)
            h_idx = hw_idx // llm_grid_w
            w_idx = hw_idx % llm_grid_w
            
            position_ids[0, pos] = st_idx + t_idx
            position_ids[1, pos] = st_idx + h_idx
            position_ids[2, pos] = st_idx + w_idx
        
        # EOI token and text after
        max_img_pos = max(
            position_ids[0, img_start:img_end].max(),
            position_ids[1, img_start:img_end].max(),
            position_ids[2, img_start:img_end].max()
        )
        
        for i, pos in enumerate(range(eoi_pos, seq_len)):
            position_ids[0, pos] = max_img_pos + 1 + i
            position_ids[1, pos] = max_img_pos + 1 + i
            position_ids[2, pos] = max_img_pos + 1 + i
        
        # Calculate rope_deltas
        max_pos = max(
            position_ids[0].max(),
            position_ids[1].max(),
            position_ids[2].max()
        )
        rope_deltas = max_pos + 1 - seq_len
        
        return position_ids, rope_deltas

    def _run_with_io_binding(self, session, inputs_dict, device="cuda"):
        """
        Run inference (IO Binding temporarily disabled to ensure correct outputs).
        
        Args:
            session: ONNX Runtime InferenceSession
            inputs_dict: Dictionary of input name -> numpy array
            device: "cuda" or "cpu"
        
        Returns:
            list of numpy arrays
        """
        # Disable IO Binding temporarily to avoid garbage outputs
        return session.run(None, inputs_dict)

    def generate(
        self,
        image_path: str,
        prompt: str = "",
        max_new_tokens: int = 100,
        temperature: float = 0.7,
        top_p: float = 0.9,
    ) -> str:
        """
        Generate text from image.

        Args:
            image_path: Path to input image
            prompt: Optional text prompt
            max_new_tokens: Maximum number of tokens to generate
            temperature: Sampling temperature
            top_p: Top-p sampling parameter

        Returns:
            Generated text
        """
        print(f"\nGenerating for image: {image_path}")
        print(f"  Prompt: '{prompt}'")
        print(f"  Max tokens: {max_new_tokens}")
        print(f"  Device: {self.device}")

        # Step 1: Encode image
        print("\n[1/5] Encoding image...")
        start_time = time.time()
        image_features = self.encode_image(image_path)
        print(f"  Image features shape: {image_features.shape}")
        print(f"  Time: {time.time() - start_time:.2f}s")

        # Step 2: Prepare input
        print("\n[2/5] Preparing input...")
        start_time = time.time()

        # Load image for processor
        image = Image.open(image_path).convert("RGB")
        
        # Create messages for GLM-OCR chat template (same as transformers_infer.py)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": image_path},
                    {"type": "text", "text": prompt if prompt else "Describe this image."}
                ]
            }
        ]
        
        inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        )
        inputs.pop("token_type_ids", None)
        
        input_ids = inputs["input_ids"].numpy()
        attention_mask = inputs["attention_mask"].numpy()

        print(f"  Input IDs shape: {input_ids.shape}")
        print(f"  Time: {time.time() - start_time:.2f}s")

        # Step 3: Embedding
        print("\n[3/5] Getting embeddings...")
        start_time = time.time()

        image_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|image|>")
        input_ids_list = input_ids[0].tolist()
        
        # Get embeddings
        embed_outputs = self._run_with_io_binding(
            self.sessions["embedding"],
            {"input_ids": input_ids},
            device=self.device
        )
        inputs_embeds = embed_outputs[0]
        
        # Replace image token embeddings with actual image features
        image_positions = [i for i, tid in enumerate(input_ids_list) if tid == image_token_id]
        
        if len(image_positions) > 0:
            num_image_tokens = image_features.shape[0]
            
            if len(image_positions) == num_image_tokens:
                for i, pos in enumerate(image_positions):
                    inputs_embeds[0, pos] = image_features[i]
                print(f"  Replaced {num_image_tokens} image tokens")
            else:
                # Remove original <|image|> tokens from input_ids and get embeddings
                non_image_mask = np.array([tid != image_token_id for tid in input_ids_list])
                inputs_embeds = inputs_embeds[:, non_image_mask, :]
                
                # Also update attention_mask to remove original image token
                attention_mask = attention_mask[:, non_image_mask]
                
                boi_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|begin_of_image|>")
                if boi_token_id in input_ids_list:
                    boi_pos = input_ids_list.index(boi_token_id)
                    before = inputs_embeds[:, :boi_pos+1, :]
                    after = inputs_embeds[:, boi_pos+1:, :]
                    image_features_batch = image_features[np.newaxis, :, :]
                    inputs_embeds = np.concatenate([before, image_features_batch, after], axis=1)
                    
                    before_mask = attention_mask[:, :boi_pos+1]
                    image_mask = np.ones((1, num_image_tokens), dtype=np.int64)
                    after_mask = attention_mask[:, boi_pos+1:]
                    attention_mask = np.concatenate([before_mask, image_mask, after_mask], axis=1)
                    
                    print(f"  Inserted {num_image_tokens} image tokens")

        print(f"  Embeddings shape: {inputs_embeds.shape}")
        print(f"  Time: {time.time() - start_time:.2f}s")

        # Step 4: Prefill
        print("\n[4/5] Running prefill...")
        start_time = time.time()

        seq_len = inputs_embeds.shape[1]
        
        # M-RoPE: Calculate position_ids with proper 3D positions for image tokens
        # We need to use the same logic as PyTorch's get_rope_index
        image_grid_thw = inputs.get("image_grid_thw")
        if image_grid_thw is not None:
            # Calculate position_ids using the same logic as PyTorch
            position_ids, rope_deltas = self._get_rope_index(
                input_ids[0].tolist(),
                image_grid_thw,
                attention_mask[0].tolist()
            )
            position_ids = position_ids[:, np.newaxis, :]
            print(f"  M-RoPE enabled: rope_deltas={rope_deltas}")
        else:
            # Fallback to simple position_ids
            position_ids = np.arange(seq_len, dtype=np.int64)
            position_ids = np.stack([position_ids, position_ids, position_ids], axis=0)
            position_ids = position_ids[:, np.newaxis, :]
            rope_deltas = 0

        prefill_inputs = {
            "inputs_embeds": inputs_embeds.astype(np.float32),
            "attention_mask": attention_mask.astype(np.int64),
            "position_ids": position_ids.astype(np.int64),
        }
        prefill_outputs = self._run_with_io_binding(
            self.sessions["prefill"],
            prefill_inputs,
            device=self.device
        )

        logits = prefill_outputs[0]
        past_key_values = prefill_outputs[1:]

        print(f"  Prefill logits shape: {logits.shape}")
        print(f"  KV cache tensors: {len(past_key_values)}")
        print(f"  Time: {time.time() - start_time:.2f}s")

        print(f"\n[5/5] Generating tokens...", flush=True)
        print(f"  DEBUG: seq_len={seq_len}, prefill positions=[0..{seq_len-1}]")
        generated_tokens = []
        
        decode_attention_mask = attention_mask.copy()

        for step in range(max_new_tokens):
            next_token_logits = logits[:, -1, :]
            next_token_id = int(np.argmax(next_token_logits, axis=-1)[0])
            generated_tokens.append(next_token_id)

            if step < 5:
                print(f"  DEBUG step={step}: token={next_token_id} ('{self.processor.tokenizer.decode([next_token_id])}')")

            if next_token_id in [self.processor.tokenizer.eos_token_id, 59253]:
                print(f"  EOS token reached at step {step + 1}")
                break

            # Update attention mask BEFORE decode (to match PyTorch behavior)
            decode_attention_mask = np.concatenate(
                [decode_attention_mask, np.ones((1, 1), dtype=np.int64)], axis=1
            )

            # Get next token embedding
            next_token_embeds = self._run_with_io_binding(
                self.sessions["embedding"],
                {"input_ids": np.array([[next_token_id]], dtype=np.int64)},
                device=self.device
            )[0]

            # Position IDs for M-RoPE: position = cache_position + rope_deltas
            # This ensures correct position encoding after image tokens
            cache_position = seq_len + step
            new_position = cache_position + rope_deltas
            decode_position_ids = np.full((3, 1, 1), new_position, dtype=np.int64)
            
            if step < 5:
                print(f"  DEBUG step={step}: cache_pos={cache_position}, rope_delta={rope_deltas}, position_id={new_position}")

            # Prepare decode inputs
            decode_inputs = {
                "inputs_embeds": next_token_embeds.astype(np.float32),
                "attention_mask": decode_attention_mask,
                "position_ids": decode_position_ids,
            }
            num_layers = len(past_key_values) // 2  # KV cache holds one (key, value) pair per layer
            for layer_idx in range(num_layers):
                decode_inputs[f"past_key_{layer_idx}"] = past_key_values[layer_idx * 2]
                decode_inputs[f"past_value_{layer_idx}"] = past_key_values[layer_idx * 2 + 1]

            # Run decode
            decode_outputs = self._run_with_io_binding(
                self.sessions["decode"],
                decode_inputs,
                device=self.device
            )

            logits = decode_outputs[0]
            past_key_values = decode_outputs[1:]

            if (step + 1) % 10 == 0:
                print(f"  Generated {step + 1} tokens...")

        print(f"\n  Total tokens generated: {len(generated_tokens)}")
        print(f"  Time: {time.time() - start_time:.2f}s")

        # Save full token sequence (input + generated) to file for comparison.
        # Note: input_ids_list holds the original tokens from the processor;
        # the tokens actually fed to the prefill model may differ due to image token expansion.
        full_sequence = input_ids_list + generated_tokens
        with open("result_token_ids_onnx.txt", "w", encoding="utf-8") as f:
            f.write(f"ONNX Full Token IDs (including input)\n")
            f.write(f"Total: {len(full_sequence)} tokens\n")
            f.write(f"Input length: {len(input_ids_list)} tokens (from processor)\n")
            f.write(f"Prefill seq_len: {seq_len} tokens (actual embeddings fed to model)\n")
            f.write(f"Generated: {len(generated_tokens)} tokens\n")
            f.write("="*80 + "\n\n")
            f.write(f"Full sequence:\n")
            f.write(f"{full_sequence}\n\n")
            f.write(f"Input part (first {len(input_ids_list)}):\n")
            f.write(f"{input_ids_list}\n\n")
            f.write(f"Generated part (last {len(generated_tokens)}):\n")
            f.write(f"{generated_tokens}\n")
        print(f"  Full token IDs saved to result_token_ids_onnx.txt")

        generated_text = self.processor.tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        )

        return generated_text
    
    def _remove_duplicate_branches(self, text: str) -> str:
        """
        Remove duplicate branches from LaTeX formula output.
        This fixes the issue where ONNX model generates repeated formula branches.
        """
        import re
        
        # Split by line breaks (\\ in LaTeX)
        lines = text.split('\\\\')
        
        seen = set()
        unique_lines = []
        
        for line in lines:
            # Normalize for comparison (remove extra spaces)
            normalized = re.sub(r'\s+', ' ', line.strip())
            
            if not normalized or normalized not in seen:
                if normalized:
                    seen.add(normalized)
                unique_lines.append(line)
        
        return '\\\\'.join(unique_lines)

    def generate_batch(
        self,
        image_paths: List[str],
        prompt: str = "",
        max_new_tokens: int = 100,
    ) -> List[str]:
        """
        Generate text for multiple images.

        Args:
            image_paths: List of image paths
            prompt: Optional text prompt
            max_new_tokens: Maximum number of tokens to generate

        Returns:
            List of generated texts
        """
        results = []
        for image_path in image_paths:
            text = self.generate(image_path, prompt, max_new_tokens)
            results.append(text)
        return results


def main():
    parser = argparse.ArgumentParser(description="GLM-OCR ONNX End-to-End Inference")
    parser.add_argument(
        "--onnx-dir",
        type=str,
        default=r"D:\models\onnx-v5\GLM-OCR",
        help="ONNX models directory",
    )
    parser.add_argument(
        "--image",
        type=str,
        default=None,
        help="Single image path",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="Formula Recognition:",
        help="Text prompt",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="Maximum tokens to generate",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        choices=["cpu", "cuda"],
        help="Device to use",
    )

    args = parser.parse_args()

    # Get image paths
    if args.image:
        image_paths = [args.image]
    else:
        print("Error: --image must be specified")
        sys.exit(1)

    # Initialize inference
    inference = GLMOcrOnnxInference(
        onnx_dir=args.onnx_dir,
        device=args.device,
    )

    # Generate
    print("\n" + "=" * 60)
    print("GLM-OCR ONNX End-to-End Inference")
    print("=" * 60)

    results = inference.generate_batch(
        image_paths=image_paths,
        prompt=args.prompt,
        max_new_tokens=args.max_tokens,
    )

    # Print results
    print("\n" + "=" * 60)
    print("Results")
    print("=" * 60)

    for i, (image_path, text) in enumerate(zip(image_paths, results)):
        print(f"\nImage {i + 1}: {image_path}")
        print(f"Generated text:\n{text}")
        print("-" * 60)


if __name__ == "__main__":
    main()

GLM-OCR

👋 Join our WeChat and Discord community
📍 Use GLM-OCR's API

Introduction

GLM-OCR is a multimodal OCR model for complex document understanding, built on the GLM-V encoder–decoder architecture. It introduces Multi-Token Prediction (MTP) loss and stable full-task reinforcement learning to improve training efficiency, recognition accuracy, and generalization. The model integrates the CogViT visual encoder pre-trained on large-scale image–text data, a lightweight cross-modal connector with efficient token downsampling, and a GLM-0.5B language decoder. Combined with a two-stage pipeline of layout analysis and parallel recognition based on PP-DocLayout-V3, GLM-OCR delivers robust and high-quality OCR performance across diverse document layouts.
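The two-stage pipeline described above (layout analysis, then recognition of each region) can be sketched as follows. This is a minimal illustration, not the released pipeline: the names `Region`, `detect_layout`, and `recognize_region` are hypothetical stand-ins for the PP-DocLayout-V3 detector and the GLM-OCR recognizer.

```python
from dataclasses import dataclass
from typing import Callable, List, Tuple


@dataclass
class Region:
    """A layout region found in the first stage."""
    box: Tuple[int, int, int, int]  # (x0, y0, x1, y1) in reading order
    kind: str                       # e.g. "text", "table", "formula"


def run_pipeline(
    image: object,
    detect_layout: Callable[[object], List[Region]],
    recognize_region: Callable[[object, Region], str],
) -> str:
    """Stage 1: layout analysis; stage 2: recognize each region.

    Regions are independent, so stage 2 can run in parallel; results
    are joined back in reading order.
    """
    regions = detect_layout(image)
    return "\n".join(recognize_region(image, r) for r in regions)
```

With stub detector and recognizer callables plugged in, `run_pipeline` returns the per-region results concatenated in reading order.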

Key Features

  • State-of-the-Art Performance: Achieves a score of 94.62 on OmniDocBench V1.5, ranking #1 overall, and delivers state-of-the-art results across major document understanding benchmarks, including formula recognition, table recognition, and information extraction.

  • Optimized for Real-World Scenarios: Designed and optimized for practical business use cases, maintaining robust performance on complex tables, code-heavy documents, seals, and other challenging real-world layouts.

  • Efficient Inference: With only 0.9B parameters, GLM-OCR supports deployment via vLLM, SGLang, and Ollama, significantly reducing inference latency and compute cost, making it ideal for high-concurrency services and edge deployments.

  • Easy to Use: Fully open-sourced and equipped with a comprehensive SDK and inference toolchain, offering simple installation, one-line invocation, and smooth integration into existing production pipelines.

Usage

vLLM

  1. Install vLLM:
pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly

or use Docker:

docker pull vllm/vllm-openai:nightly
  2. Run the server:
pip install git+https://github.com/huggingface/transformers.git
vllm serve zai-org/GLM-OCR --allowed-local-media-path / --port 8080

SGLang

  1. Use Docker:
docker pull lmsysorg/sglang:dev

or build from source:

pip install git+https://github.com/sgl-project/sglang.git#subdirectory=python
  2. Run the server:
pip install git+https://github.com/huggingface/transformers.git
python -m sglang.launch_server --model zai-org/GLM-OCR --port 8080

Ollama

  1. Download Ollama.
  2. Run with:
ollama run glm-ocr

Ollama automatically uses the image file path when an image is dragged into the terminal:

ollama run glm-ocr "Text Recognition: ./image.png"
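A running Ollama server can also be called programmatically over its HTTP API. The sketch below uses Ollama's `/api/generate` endpoint with the image sent as base64; the default server address and the `glm-ocr` model name are assumed from the command above.

```python
import base64
import json
from urllib import request


def build_payload(image_path: str, prompt: str, model: str = "glm-ocr") -> dict:
    """Build a non-streaming /api/generate request body with one base64 image."""
    with open(image_path, "rb") as f:
        img_b64 = base64.b64encode(f.read()).decode("ascii")
    return {"model": model, "prompt": prompt, "images": [img_b64], "stream": False}


def ocr_via_ollama(image_path: str, prompt: str = "Text Recognition:") -> str:
    """POST to a local Ollama server and return the generated text."""
    body = json.dumps(build_payload(image_path, prompt)).encode("utf-8")
    req = request.Request(
        "http://localhost:11434/api/generate",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with request.urlopen(req) as resp:
        return json.loads(resp.read())["response"]
```

`ocr_via_ollama("./image.png")` then returns the recognized text, provided `ollama serve` is running locally with the model pulled.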

Transformers

pip install git+https://github.com/huggingface/transformers.git

from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

MODEL_PATH = "zai-org/GLM-OCR"
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "url": "test_image.png"
            },
            {
                "type": "text",
                "text": "Text Recognition:"
            }
        ],
    }
]
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForImageTextToText.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype="auto",
    device_map="auto",
)
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)
inputs.pop("token_type_ids", None)
generated_ids = model.generate(**inputs, max_new_tokens=8192)
output_text = processor.decode(generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False)
print(output_text)

Supported Prompts

GLM-OCR currently supports two types of prompt scenarios:

  1. Document Parsing – extract raw content from documents. Supported tasks include:
{
    "text": "Text Recognition:",
    "formula": "Formula Recognition:",
    "table": "Table Recognition:"
}
  2. Information Extraction – extract structured information from documents. Prompts must follow a strict JSON schema. For example, to extract personal ID information (the Chinese prompt below reads "Output the information in the image in the following JSON format:"):
请按下列JSON格式输出图中信息:
{
    "id_number": "",
    "last_name": "",
    "first_name": "",
    "date_of_birth": "",
    "address": {
        "street": "",
        "city": "",
        "state": "",
        "zip_code": ""
    },
    "dates": {
        "issue_date": "",
        "expiration_date": ""
    },
    "sex": ""
}

⚠️ Note: When using information extraction, the output must strictly adhere to the defined JSON schema to ensure downstream processing compatibility.
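One way to enforce this downstream is a key-structure check against the requested schema. The helper below is a minimal sketch (the function name `matches_schema` is not part of the GLM-OCR SDK): it verifies the model output is valid JSON whose nested keys exactly match the schema, treating leaf values as free-form.

```python
import json


def matches_schema(output_text: str, schema: dict) -> bool:
    """Return True if output_text is valid JSON whose nested key
    structure exactly matches the requested schema."""
    try:
        data = json.loads(output_text)
    except json.JSONDecodeError:
        return False

    def same_keys(got, want) -> bool:
        if isinstance(want, dict):
            return (
                isinstance(got, dict)
                and set(got) == set(want)
                and all(same_keys(got[k], want[k]) for k in want)
            )
        return True  # leaf values are free-form strings

    return same_keys(data, schema)
```

Outputs that drop a required key, add an extra one, or are not valid JSON at all fail the check and can be retried or flagged.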

GLM-OCR SDK

We provide an easy-to-use SDK for working with GLM-OCR more efficiently and conveniently. Please check our GitHub repository for more details.

Acknowledgement

This project is inspired by the excellent work of the following projects and communities:

License

The GLM-OCR model is released under the MIT License.

The complete OCR pipeline integrates PP-DocLayoutV3 for document layout analysis, which is licensed under the Apache License 2.0. Users should comply with both licenses when using this project.
