Meilisearch as documentation retrieval generator

This document demonstrates how to use Meilisearch as document storage for a simplified RAG (Retrieval-Augmented Generation) system.

How it works

Documents are stored in Meilisearch as a searchable index
LiveHelperChat searches using AI-generated queries. The AI extracts important keywords before performing the Meilisearch query
The LLM generates responses based on the retrieved context from relevant documents

Setup

REST API Configuration

In this example, we'll use the Gemini streaming version - REST API. Don't forget to change YOUR_API_KEY
MeiliSearch REST API configuration - REST API. Don't forget to change MEILI_API_KEY

Bot Configuration

Import the MeiliSearch bot configuration - Bot JSON. When importing the bot, choose GeminiStream

Running Meilisearch

Create a docker-compose.yml file with the following content:

version: '3'

services:
  meilisearch:
    image: getmeili/meilisearch:latest
    ports:
      - "7700:7700"
    volumes:
      - ./meili_data:/meili_data
    environment:
        MEILI_API_KEY: "xxxxxxxxxxxxxxxxxxxx"
        MEILI_MASTER_KEY: "xxxxxxxxxxxxxxxxxxxx"
        MEILI_ENV: "production"
    restart: unless-stopped

Run the container with:

docker-compose -f docker-compose.yml up -d

Storing documents in Meilisearch

In this example, I'll be using a generated markdown file from https://github.com/LiveHelperChat/crawler-to-md

Sample PHP script command:

cd /meili && /usr/bin/php ./command.php MEILI_API_KEY lhc_doc /path/to/output/doc_livehelperchat_com/httpsdoc.livehelperchat.com.md

Script content:

<?php
/**
 * Meilisearch document indexer script (PHP version)
 * Usage: php command.php <master_key> <index_name> <path_to_md_file>
 */

// Check if all required arguments are provided
if ($argc !== 4) {
    echo "Usage: php {$argv[0]} <master_key> <index_name> <path_to_md_file>\n";
    echo "Example: php {$argv[0]} your_master_key lhc_docs ./sample.md\n";
    exit(1);
}

$masterKey = $argv[1];
$indexName = $argv[2];
$mdFilePath = $argv[3];
$meilisearchUrl = "http://localhost:7700";

// Check if the markdown file exists
if (!file_exists($mdFilePath)) {
    echo "Error: File {$mdFilePath} not found!\n";
    exit(1);
}

/**
 * Extract metadata from HTML comment
 */
function extractMetadata($content) {
    $metadata = [
        'url' => '',
        'title' => '',
        'hostname' => '',
        'description' => '',
        'sitename' => '',
        'date' => '',
        'filedate' => '',
        'categories' => '[]',
        'tags' => '[]'
    ];

    // Extract URL
    if (preg_match('/URL: (\S+)/', $content, $matches)) {
        $metadata['url'] = $matches[1];
    }

    // Extract title
    if (preg_match('/title: (.+)/', $content, $matches)) {
        $metadata['title'] = trim($matches[1]);
    }

    // Extract hostname
    if (preg_match('/hostname: (\S+)/', $content, $matches)) {
        $metadata['hostname'] = $matches[1];
    }

    // Extract description
    if (preg_match('/description: (.+)/', $content, $matches)) {
        $metadata['description'] = trim($matches[1]);
    }

    // Extract sitename
    if (preg_match('/sitename: (\S+)/', $content, $matches)) {
        $metadata['sitename'] = $matches[1];
    }

    // Extract date
    if (preg_match('/date: (\S+)/', $content, $matches)) {
        $metadata['date'] = $matches[1];
    }

    // Extract filedate
    if (preg_match('/filedate: (\S+)/', $content, $matches)) {
        $metadata['filedate'] = $matches[1];
    }

    // Extract categories
    if (preg_match('/categories: (\[.*?\])/', $content, $matches)) {
        $metadata['categories'] = $matches[1];
    }

    // Extract tags
    if (preg_match('/tags: (\[.*?\])/', $content, $matches)) {
        $metadata['tags'] = $matches[1];
    }

    return $metadata;
}

/**
 * Clean and prepare content
 */
function cleanContent($content) {
    // Remove HTML comments and metadata
    $content = preg_replace('/^<!--$.+?^-->$/ms', '', $content);
    // Remove --- separators
    $content = preg_replace('/^---$/m', '', $content);
    // Trim leading/trailing whitespace
    return trim($content);
}

/**
 * Create JSON document
 */
function createJsonDocument($id, $metadata, $content) {
    // Validate and decode JSON arrays
    $categories = json_decode($metadata['categories']);
    $tags = json_decode($metadata['tags']);

    if ($categories === null) {
        $categories = [];
    }
    if ($tags === null) {
        $tags = [];
    }

    return [
        'id' => $id,
        'url' => $metadata['url'],
        'title' => $metadata['title'],
        'hostname' => $metadata['hostname'],
        'description' => $metadata['description'],
        'sitename' => $metadata['sitename'],
        'date' => $metadata['date'],
        'filedate' => $metadata['filedate'],
        'categories' => $categories,
        'tags' => $tags,
        'content' => $content
    ];
}

/**
 * Make HTTP request to Meilisearch
 */
function makeRequest($url, $method, $data = null, $masterKey = '') {
    $ch = curl_init();

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
    curl_setopt($ch, CURLOPT_HTTPHEADER, [
        'Authorization: Bearer ' . $masterKey,
        'Content-Type: application/json'
    ]);

    if ($data !== null) {
        curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($data));
    }

    $response = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    return ['response' => $response, 'http_code' => $httpCode];
}

echo "Starting document indexing process...\n";
echo "Meilisearch URL: {$meilisearchUrl}\n";
echo "Index: {$indexName}\n";
echo "File: {$mdFilePath}\n";

// Create index if it doesn't exist
echo "Creating/updating index...\n";
$indexData = [
    'uid' => $indexName,
    'primaryKey' => 'id'
];
makeRequest("{$meilisearchUrl}/indexes", 'POST', $indexData, $masterKey);

// Configure searchable attributes
echo "Configuring searchable attributes...\n";
$searchableAttributes = ['title', 'content', 'description', 'url'];
makeRequest("{$meilisearchUrl}/indexes/{$indexName}/settings/searchable-attributes", 'PUT', $searchableAttributes, $masterKey);

// Configure filterable attributes
echo "Configuring filterable attributes...\n";
$filterableAttributes = ['hostname', 'sitename', 'date', 'filedate', 'categories', 'tags'];
makeRequest("{$meilisearchUrl}/indexes/{$indexName}/settings/filterable-attributes", 'PUT', $filterableAttributes, $masterKey);

// Read the entire file
echo "Processing documents...\n";
$fileContent = file_get_contents($mdFilePath);

// Create temporary directory for processing
$scriptDir = dirname(__FILE__);
$tempDir = $scriptDir . '/temp_docs_php';
if (!is_dir($tempDir)) {
    mkdir($tempDir, 0755, true);
}

// Split the file by HTML comments
$documents = [];
$parts = preg_split('/^<!--$/m', $fileContent);

foreach ($parts as $index => $part) {
    if ($index === 0) continue; // Skip the part before the first comment

    // Find the end of the comment and the content
    if (preg_match('/^(.+?)^-->(.*)$/ms', $part, $matches)) {
        $comment = '<!--' . $matches[1] . '-->';
        $content = $comment . $matches[2];
        $documents[] = $content;
    }
}

// Process each document
$docCount = 0;
$batchDocs = [];
$batchSize = 10;
$currentDocIds = []; // Track current document IDs

foreach ($documents as $docIndex => $docContent) {
    echo "Processing document " . ($docIndex + 1) . "\n";

    // Extract metadata
    $metadata = extractMetadata($docContent);

    // Skip if no URL (invalid document)
    if (empty($metadata['url'])) {
        echo "Skipping document with no URL\n";
        continue;
    }

    // Clean content
    $cleanContent = cleanContent($docContent);

    // Generate unique ID from URL
    $docId = preg_replace('/[^a-zA-Z0-9]/', '_', $metadata['url']);
    $docId = preg_replace('/_+/', '_', $docId);
    $docId = trim($docId, '_');

    // Track this document ID
    $currentDocIds[] = $docId;

    // Create JSON document
    try {
        $jsonDoc = createJsonDocument($docId, $metadata, $cleanContent);
        $batchDocs[] = $jsonDoc;
        $docCount++;

        // Send batch every 10 documents or if this is the last document
        if (count($batchDocs) >= $batchSize || $docIndex === count($documents) - 1) {
            echo "Sending batch of " . count($batchDocs) . " documents to Meilisearch...\n";

            $result = makeRequest("{$meilisearchUrl}/indexes/{$indexName}/documents", 'POST', $batchDocs, $masterKey);
            echo "Response: " . $result['response'] . "\n";

            // Reset batch
            $batchDocs = [];
        }
    } catch (Exception $e) {
        echo "Error processing document: " . $e->getMessage() . "\n";
        continue;
    }
}

echo "Document indexing completed!\n";
echo "Total documents processed: {$docCount}\n";
echo "Index: {$indexName}\n";

// Remove old documents that no longer exist
echo "Checking for old documents to remove...\n";
$allDocsResult = makeRequest("{$meilisearchUrl}/indexes/{$indexName}/documents?limit=10000", 'GET', null, $masterKey);
if ($allDocsResult['http_code'] === 200) {
    $allDocsResponse = json_decode($allDocsResult['response'], true);
    $existingDocs = $allDocsResponse['results'] ?? [];

    $docsToDelete = [];
    foreach ($existingDocs as $doc) {
        if (!in_array($doc['id'], $currentDocIds)) {
            $docsToDelete[] = $doc['id'];
        }
    }

    if (!empty($docsToDelete)) {
        echo "Removing " . count($docsToDelete) . " old documents...\n";
        $deleteResult = makeRequest("{$meilisearchUrl}/indexes/{$indexName}/documents/delete-batch", 'POST', $docsToDelete, $masterKey);
        echo "Delete response: " . $deleteResult['response'] . "\n";
    } else {
        echo "No old documents to remove.\n";
    }
} else {
    echo "Warning: Could not retrieve existing documents for cleanup.\n";
}

// Get index stats
echo "Getting index statistics...\n";
$statsResult = makeRequest("{$meilisearchUrl}/indexes/{$indexName}/stats", 'GET', null, $masterKey);
$stats = json_decode($statsResult['response'], true);
echo json_encode($stats, JSON_PRETTY_PRINT) . "\n";

// Clean up temporary directory
if (is_dir($tempDir)) {
    $files = glob($tempDir . '/*');
    foreach ($files as $file) {
        if (is_file($file)) {
            unlink($file);
        }
    }
    rmdir($tempDir);
}
?>

How it works​

Setup

REST API Configuration​

Bot Configuration​

Running Meilisearch​

Storing documents in Meilisearch​

How it works

REST API Configuration

Bot Configuration

Running Meilisearch

Storing documents in Meilisearch