Commit b050332

Merge pull request NanoNets#24 from NanoNets/dev/markdown
Add image & pdf 2 markdown support
2 parents a742a3b + 48f437a commit b050332

16 files changed: +679 additions, −21 deletions
EXT_README.md — 24 additions, 4 deletions

@@ -52,7 +52,7 @@ python -m docext.app.app
 python -m docext.app.app --model_name "hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ" --max_img_size 1024 # `--help` for more options
 ```

-The interface will be available at `http://localhost:7860` with default credentials: (You can change the port by using `--ui_port` flag)
+The interface will be available at `http://localhost:7860` with default credentials: (You can change the port by using `--server_port` flag)

 - Username: `admin`
 - Password: `admin`
@@ -71,6 +71,7 @@ python -m docext.app.app --concurrency_limit 10
 import pandas as pd
 import concurrent.futures
 from gradio_client import Client, handle_file
+from docext.core.file_converters.pdf_converter import PDFConverter


 def dataframe_to_custom_dict(df: pd.DataFrame) -> dict:
@@ -110,6 +111,12 @@ fields_and_tables = dataframe_to_custom_dict(pd.DataFrame([
     {"name": "item_description", "type": "table", "description": "Item/Product description"}
     # add more fields and table columns as needed
 ]))
+# client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
+CLIENT_URL = "http://localhost:7860"
+
+
+
+## ======= Image Inputs =======

 file_inputs = [
     {
@@ -119,21 +126,34 @@ file_inputs = [
 ]

 ## send single request
-### client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
 fields_df, tables_df = get_extracted_fields_and_tables(
-    "http://localhost:7860", "admin", "admin", "hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ", fields_and_tables, file_inputs
+    CLIENT_URL, "admin", "admin", "hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ", fields_and_tables, file_inputs
 )
 print("========Fields:=========")
 print(fields_df)
 print("========Tables:=========")
 print(tables_df)


+## ======= PDF Inputs =======
+
+pdf_converter = PDFConverter()
+document_pages = pdf_converter.convert_and_save_images("assets/invoice_test.pdf")
+file_inputs = [{"image": handle_file(page)} for page in document_pages]
+
+fields_df, tables_df = get_extracted_fields_and_tables(
+    CLIENT_URL, "admin", "admin", "hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ", fields_and_tables, file_inputs
+)
+print("========Fields:=========")
+print(fields_df)
+print("========Tables:=========")
+print(tables_df)
+
 ## send multiple requests in parallel
 # Define a wrapper function for parallel execution
 def run_request():
     return get_extracted_fields_and_tables(
-        "http://localhost:7860", "admin", "admin", "hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ", fields_and_tables, file_inputs
+        CLIENT_URL, "admin", "admin", "hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ", fields_and_tables, file_inputs
     )

 # Use ThreadPoolExecutor to send 10 requests in parallel
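The EXT_README excerpt above is truncated at the ThreadPoolExecutor step. A minimal, self-contained sketch of that fan-out pattern follows; `run_request` here is a stand-in for the README's real function (which calls the docext server via `gradio_client` and returns `(fields_df, tables_df)`), since running it requires a live server:

```python
import concurrent.futures

# Stand-in for run_request() from the README; the real version calls
# get_extracted_fields_and_tables(...) against a running docext server.
def run_request():
    return ("fields_df", "tables_df")

# Send 10 requests in parallel, mirroring the README's ThreadPoolExecutor step.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(run_request) for _ in range(10)]
    results = [f.result() for f in concurrent.futures.as_completed(futures)]

print(len(results))  # 10
```

Swapping the stand-in for the real `run_request` keeps the same structure; `as_completed` yields results in finish order, not submission order.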

PDF2MD_README.md — 145 additions, 0 deletions

@@ -0,0 +1,145 @@
+# PDF to Markdown API Documentation
+
+Convert PDF documents and images to high-quality markdown format using vision-language models.
+
+## Table of Contents
+- [Features](#features)
+- [Getting Started](#getting-started)
+  - [Quickstart](#quickstart)
+  - [Installation](#installation)
+  - [Web Interface](#web-interface)
+  - [API Access](#api-access)
+- [Requirements](#requirements)
+- [Supported Models & Platforms](#supported-models--platforms)
+  - [Models with vLLM (Linux)](#models-with-vllm-linux)
+
+## Features
+
+- **LaTeX Equation Recognition**: Convert both inline and block LaTeX equations in images to markdown.
+- **Intelligent Image Description**: Generate a detailed description of every image in the document within `<img></img>` tags.
+- **Signature Detection**: Detect and mark signatures in the document. Signature text is extracted within `<signature></signature>` tags.
+- **Watermark Detection**: Detect and mark watermarks in the document. Watermark text is extracted within `<watermark></watermark>` tags.
+- **Page Number Detection**: Detect and mark page numbers in the document. Page numbers are extracted within `<page_number></page_number>` tags.
+- **Checkboxes and Radio Buttons**: Convert form checkboxes and radio buttons into standardized Unicode symbols (☐, ☑, ☒).
+- **Table Detection**: Convert complex tables into HTML tables.
+
+## Getting Started
+### Quickstart
+- [Colab notebook for on-prem deployment](https://colab.research.google.com/drive/1uKO70sctH8G59yYH_rLW6CPK4Vj2YmI6?usp=sharing)
+
+### Installation
+```bash
+# create a virtual environment
+## install uv if not installed
+curl -LsSf https://astral.sh/uv/install.sh | sh
+## create a virtual environment with python 3.11
+uv venv --python=3.11
+source .venv/bin/activate
+
+# Install from PyPI
+uv pip install docext
+
+# Or install from source
+git clone https://github.com/nanonets/docext.git
+cd docext
+uv pip install -e .
+```
+
+### Web Interface
+
+docext includes a Gradio-based web interface for easy document processing:
+
+```bash
+# Start the web interface with default configs
+python -m docext.app.app --model_name hosted_vllm/nanonets/Nanonets-OCR-s
+
+# Start the web interface with custom configs
+python -m docext.app.app --model_name hosted_vllm/nanonets/Nanonets-OCR-s --max_img_size 1024 --concurrency_limit 16 # `--help` for more options
+```
+
+The interface will be available at `http://localhost:7860` with default credentials (you can change the port with the `--server_port` flag):
+
+- Username: `admin`
+- Password: `admin`
+
+Check the [Supported Models & Platforms](#supported-models--platforms) section for more model options.
+
+### API Access
+
+```python
+import time
+from gradio_client import Client, handle_file
+
+def convert_pdf_to_markdown(
+    client_url: str,
+    username: str,
+    password: str,
+    file_paths: list[str],
+    model_name: str = "hosted_vllm/nanonets/Nanonets-OCR-s"
+):
+    """
+    Convert PDF/images to markdown using the API
+
+    Args:
+        client_url: URL of the docext server
+        username: Authentication username
+        password: Authentication password
+        file_paths: List of file paths to convert
+        model_name: Model to use for conversion
+
+    Returns:
+        str: Converted markdown content
+    """
+    client = Client(client_url, auth=(username, password))
+
+    # Prepare file inputs
+    file_inputs = [{"image": handle_file(file_path)} for file_path in file_paths]
+
+    # Convert to markdown (non-streaming)
+    result = client.predict(
+        images=file_inputs,
+        api_name="/process_markdown_streaming"
+    )
+
+    return result
+
+# Example usage
+# client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
+CLIENT_URL = "http://localhost:7860"
+
+# Single image conversion
+markdown_content = convert_pdf_to_markdown(
+    CLIENT_URL,
+    "admin",
+    "admin",
+    ["assets/invoice_test.pdf"]
+)
+print(markdown_content)
+
+# Multiple files conversion
+markdown_content = convert_pdf_to_markdown(
+    CLIENT_URL,
+    "admin",
+    "admin",
+    ["assets/invoice_test.jpeg", "assets/invoice_test.pdf"]
+)
+print(markdown_content)
+```
+
+## Requirements
+
+- Python 3.11+
+- CUDA-compatible GPU (for optimal performance). Use Google Colab for free GPU access.
+- Dependencies listed in requirements.txt
+
+## Supported Models & Platforms
+### Models with vLLM (Linux)
+
+We recommend the `hosted_vllm/nanonets/Nanonets-OCR-s` model for best performance; it is trained to do OCR with semantic tagging. You can also use any other VLM supported by vLLM. Since Nanonets-OCR-s is a 3B model, it can run on GPUs with small VRAM.
+
+Examples:
+| Model | `--model_name` |
+|-------|--------------|
+| Nanonets-OCR-s | `hosted_vllm/nanonets/Nanonets-OCR-s` |
+| Qwen/Qwen2.5-VL-7B-Instruct-AWQ | `hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ` |
+| Qwen/Qwen2.5-VL-7B-Instruct | `hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct` |
+| Qwen/Qwen2.5-VL-32B-Instruct | `hosted_vllm/Qwen/Qwen2.5-VL-32B-Instruct` |

README.md — 29 additions, 4 deletions

@@ -18,18 +18,42 @@
 </a>
 </p>

-![Demo Docext](https://raw.githubusercontent.com/NanoNets/docext/main/assets/demo.jpg)
+<!-- ![Demo Docext](https://raw.githubusercontent.com/NanoNets/docext/main/assets/pdf2markdown.jpg) -->
+![Demo Docext](assets/pdf2markdown.png)


+## New Model Release: Nanonets-OCR-s
+
+**We're excited to announce the release of Nanonets-OCR-s, a compact 3B-parameter model trained for efficient image-to-markdown conversion with semantic understanding of images, signatures, watermarks, and more!**
+
+📢 [Read the full announcement](https://nanonets.com/research/nanonets-ocr-s) | 🤗 [Hugging Face model](https://huggingface.co/nanonets/Nanonets-OCR-s)

 ## Overview

-docext is an OCR-free tool for extracting structured information from documents such as invoices, passports, and other documents. It leverages vision-language models (VLMs) to accurately identify and extract both field data and tabular information from document images.
+docext is a comprehensive on-premises document intelligence toolkit powered by vision-language models (VLMs). It provides three core capabilities:
+
+**📄 PDF & Image to Markdown Conversion**: Transform documents into structured markdown with intelligent content recognition, including LaTeX equations, signatures, watermarks, tables, and semantic tagging.

-The [Intelligent Document Processing Leaderboard](https://idp-leaderboard.org/) tracks and evaluates performance vision-language models across OCR, Key Information Extraction (KIE), document classification, table extraction, and other intelligent document processing tasks.
+**🔍 Document Information Extraction**: OCR-free extraction of structured information (fields, tables, etc.) from documents such as invoices, passports, and other document types, with confidence scoring.
+
+**📊 Intelligent Document Processing Leaderboard**: A comprehensive benchmarking platform that tracks and evaluates vision-language model performance across OCR, Key Information Extraction (KIE), document classification, table extraction, and other intelligent document processing tasks.


 ## Features
+### PDF and Image to Markdown
+Convert both PDFs and images to markdown with content recognition and semantic tagging.
+- **LaTeX Equation Recognition**: Convert both inline and block LaTeX equations in images to markdown.
+- **Intelligent Image Description**: Generate a detailed description of every image in the document within `<img></img>` tags.
+- **Signature Detection**: Detect and mark signatures in the document. Signature text is extracted within `<signature></signature>` tags.
+- **Watermark Detection**: Detect and mark watermarks in the document. Watermark text is extracted within `<watermark></watermark>` tags.
+- **Page Number Detection**: Detect and mark page numbers in the document. Page numbers are extracted within `<page_number></page_number>` tags.
+- **Checkboxes and Radio Buttons**: Convert form checkboxes and radio buttons into standardized Unicode symbols (☐, ☑, ☒).
+- **Table Detection**: Convert complex tables into HTML tables.
+
+🔍 For in-depth information, see the [release blog](https://github.com/NanoNets/docext/tree/main/docext/benchmark).
+
+For setup instructions and additional details, check out the full feature guide for the [pdf to markdown](https://github.com/NanoNets/docext/blob/main/PDF2MD_README.md).
+
 ### Intelligent Document Processing Leaderboard
 This benchmark evaluates performance across seven key document intelligence challenges:

@@ -64,13 +88,14 @@ For more details (Installation, Usage, and so on), please check out the [feature
 ## Change Log

 ### Latest Updates
+- **12-06-2025** – Added PDF and image to markdown support.
 - **06-06-2025** – Added `gemini-2.5-pro-preview-06-05` evaluation metrics to the leaderboard.
 - **04-06-2025** – Added support for PDF and multiple documents in `docext` extraction.
-- **23-05-2025** – Added `gemini-2.5-pro-preview-03-25`, `claude-sonnet-4` evaluation metrics to the leaderboard.

 <details>
 <summary>Older Changes</summary>

+- **23-05-2025** – Added `gemini-2.5-pro-preview-03-25`, `claude-sonnet-4` evaluation metrics to the leaderboard.
 - **17-05-2025** – Added `InternVL3-38B-Instruct`, `qwen2.5-vl-32b-instruct` evaluation metrics to the leaderboard.
 - **16-05-2025** – Added `gemma-3-27b-it` evaluation metrics to the leaderboard.
 - **12-05-2025** – Added `Claude 3.7 sonnet`, `mistral-medium-3` evaluation metrics to the leaderboard.

Troubleshooting.md — 3 additions, 0 deletions

@@ -13,3 +13,6 @@
 ### 3. `RuntimeError: Failed to infer device type`
 - This error occurs when CUDA drivers are not installed, affecting vLLM.
 - Follow the troubleshooting guide [here](https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#failed-to-infer-device-type).
+
+### 4. `ValueError: Bfloat16 is only supported on GPUs with compute capability of at least 8.0. Your Tesla T4 GPU has compute capability 7.5. You can use float16 instead by explicitly setting the dtype flag in CLI, for example: --dtype=half.`
+- Use `--dtype=float16` instead of `--dtype=bfloat16`.
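The error above is driven by the GPU's compute capability: bfloat16 requires 8.0+ (Ampere or newer), while the Tesla T4 is 7.5. An illustrative sketch of that check, assuming PyTorch-style `(major, minor)` capability tuples:

```python
# Illustrative sketch: bfloat16 needs compute capability >= 8.0; older GPUs
# like the Tesla T4 (7.5) must fall back to float16.
def pick_dtype(major: int, minor: int) -> str:
    return "bfloat16" if (major, minor) >= (8, 0) else "float16"

print(pick_dtype(7, 5))  # Tesla T4 -> float16
print(pick_dtype(8, 0))  # A100 -> bfloat16
```

With PyTorch available, the real tuple comes from `torch.cuda.get_device_capability()`.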

assets/invoice_test.pdf — 57.9 KB (binary file not shown)

assets/pdf2markdown.png — 2.81 MB (binary file not shown)
