Document Q&A System with OpenAI Integration
This tutorial shows how to build a document Q&A system that stores documents in Papr Memory and uses OpenAI to answer questions about them.
Prerequisites
Before you begin, you'll need:
- A Papr Memory API key
- An OpenAI API key with access to a model that supports function calling
- Node.js installed
Implementation
1. Project Setup
Set up your project:
mkdir document-qa
cd document-qa
npm init -y
npm install express dotenv node-fetch openai multer
npm install @papr/memory
Create a `.env` file:
PAPR_MEMORY_API_KEY=your_papr_api_key_here
OPENAI_API_KEY=your_openai_api_key_here
2. Document Processing and Q&A
Create `app.js`:
import express from 'express';
import multer from 'multer';
import fs from 'fs';
import OpenAI from 'openai';
import { Papr } from '@papr/memory';
import { fileURLToPath } from 'url';
import { dirname } from 'path';
import path from 'path';
import dotenv from 'dotenv';
// Load environment variables from .env (PAPR_MEMORY_API_KEY, OPENAI_API_KEY)
dotenv.config();
// Initialize the OpenAI and Papr Memory API clients from environment keys
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY
});
const papr = new Papr({
apiKey: process.env.PAPR_MEMORY_API_KEY
});
// Get directory of current module (ES modules have no built-in __dirname)
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Set up Express app with JSON body parsing for the /documents/query endpoint
const app = express();
app.use(express.json());
// Configure file storage for document uploads
// Configure multer to keep uploaded documents on disk under ./uploads,
// prefixing each stored filename with a timestamp to avoid collisions.
const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    const destinationDir = path.join(__dirname, 'uploads');
    if (!fs.existsSync(destinationDir)) {
      fs.mkdirSync(destinationDir, { recursive: true });
    }
    cb(null, destinationDir);
  },
  filename: (req, file, cb) => {
    cb(null, `${Date.now()}-${file.originalname}`);
  },
});
const upload = multer({ storage });
// Document upload endpoint
// Document upload endpoint.
// Reads the uploaded file as UTF-8 text, splits it into chunks, and stores
// every chunk in Papr Memory tagged with a shared document_id so the query
// endpoint can later scope searches to this document.
app.post('/documents/upload', upload.single('document'), async (req, res) => {
  try {
    const file = req.file;
    if (!file) {
      // multer leaves req.file undefined when no "document" field was sent;
      // without this guard, readFileSync would throw and we would return 500
      return res.status(400).json({ error: 'No document uploaded' });
    }
    // Simple document processing - in production, use a more robust parser
    // (this assumes a plain-text upload; binary formats will produce garbage)
    const text = fs.readFileSync(file.path, 'utf8');
    // The raw upload is no longer needed once its text is in memory;
    // delete it so the uploads/ directory does not grow without bound
    fs.unlink(file.path, () => {});
    // Create unique document ID
    const documentId = `doc_${Date.now().toString(36)}`;
    // Chunk the document text (simple implementation)
    const chunks = chunkText(text, 1000);
    if (chunks.length === 0) {
      return res.status(400).json({ error: 'Document contains no text' });
    }
    // Prepare memory items; metadata lets the query endpoint filter by
    // document_id and re-assemble chunks in their original order
    const memories = chunks.map((chunk, index) => ({
      content: chunk,
      type: 'text',
      metadata: {
        document_id: documentId,
        filename: file.originalname,
        chunk_index: index,
        total_chunks: chunks.length,
        hierarchical_structures: `Documents/${file.originalname}`
      }
    }));
    // Store document chunks in Papr Memory using the batch endpoint
    const batchResponse = await papr.memory.addBatch({
      memories: memories,
      batch_size: 10
    });
    console.log(`Document processed: ${batchResponse.total_successful}/${chunks.length} chunks stored`);
    res.json({
      documentId,
      chunksProcessed: batchResponse.total_successful,
      totalChunks: chunks.length
    });
  } catch (error) {
    console.error('Error processing document:', error);
    res.status(500).json({ error: error.message });
  }
});
// Helper function to chunk text into manageable pieces
/**
 * Split raw text into chunks of roughly maxLength characters.
 *
 * Splits on blank lines first to keep paragraphs together; any paragraph
 * longer than maxLength is further split on sentence boundaries. A single
 * sentence longer than maxLength becomes its own (oversized) chunk.
 *
 * Fixes a bug in the naive version: when the accumulator was empty and a
 * sentence/paragraph did not fit, an empty string was pushed into the
 * result, producing blank chunks that would be stored as empty memories.
 *
 * @param {string} text - The document text to split.
 * @param {number} [maxLength=1000] - Target maximum chunk size in characters.
 * @returns {string[]} Non-empty, trimmed chunks in document order.
 */
function chunkText(text, maxLength = 1000) {
  const chunks = [];
  let currentChunk = '';

  // Push the accumulator as a chunk, skipping blanks (the bug fix).
  const flush = () => {
    const trimmed = currentChunk.trim();
    if (trimmed) {
      chunks.push(trimmed);
    }
    currentChunk = '';
  };

  // Split by paragraphs first
  const paragraphs = text.split(/\n\s*\n/);
  for (const paragraph of paragraphs) {
    if (paragraph.length > maxLength) {
      // Paragraph alone exceeds max length: split by sentences
      const sentences = paragraph.split(/(?<=[.!?])\s+/);
      for (const sentence of sentences) {
        if (currentChunk.length + sentence.length <= maxLength) {
          currentChunk += sentence + ' ';
        } else {
          flush();
          currentChunk = sentence + ' ';
        }
      }
    } else if (currentChunk.length + paragraph.length > maxLength) {
      // Paragraph fits on its own but not with the accumulator
      flush();
      currentChunk = paragraph + '\n\n';
    } else {
      currentChunk += paragraph + '\n\n';
    }
  }
  // Add the last chunk if not empty
  flush();
  return chunks;
}
// Document query endpoint
// Document query endpoint.
// Flow: (1) ask OpenAI whether the question requires a document search via
// function calling; (2) if so, search Papr Memory scoped to the document_id;
// (3) feed the retrieved chunks back to OpenAI to compose the final answer.
app.post('/documents/query', async (req, res) => {
  try {
    const { documentId, question } = req.body ?? {};
    if (!documentId || !question) {
      // Fail fast with a clear message instead of a confusing downstream error
      return res.status(400).json({ error: 'documentId and question are required' });
    }
    // Use OpenAI to process the query and decide if we need document access
    const completion = await openai.chat.completions.create({
      model: "gpt-3.5-turbo",
      messages: [
        { role: "system", content: "You are a document assistant that helps users find information in their documents. You can use the searchDocument function to search for information." },
        { role: "user", content: question }
      ],
      functions: [
        {
          name: "searchDocument",
          description: "Search for information in a specific document",
          parameters: {
            type: "object",
            properties: {
              documentId: {
                type: "string",
                description: "The ID of the document to search"
              },
              searchQuery: {
                type: "string",
                description: "What to search for in the document"
              }
            },
            required: ["documentId", "searchQuery"]
          }
        }
      ],
      function_call: "auto"
    });
    const responseMessage = completion.choices[0].message;
    // Check if the model decided to call the search function
    if (responseMessage.function_call && responseMessage.function_call.name === "searchDocument") {
      const functionArguments = JSON.parse(responseMessage.function_call.arguments);
      // Search Papr Memory using the model's query, but scope the search to
      // the client-supplied documentId (more trustworthy than model output)
      const searchResponse = await papr.memory.search({
        query: functionArguments.searchQuery,
        metadata: {
          document_id: documentId
        },
        max_memories: 30, // Retrieve more chunks if needed
        enable_agentic_graph: true // Enable graph search for better results
      });
      let documentContent = '';
      if (searchResponse.data && searchResponse.data.memories && searchResponse.data.memories.length > 0) {
        // Sort by chunk index so the content reads in document order
        const sortedMemories = searchResponse.data.memories.sort((a, b) => {
          return (a.metadata?.chunk_index || 0) - (b.metadata?.chunk_index || 0);
        });
        // Combine content from relevant chunks
        documentContent = sortedMemories
          .map(memory => memory.content)
          .join('\n\n');
      }
      // Second call: let the model answer grounded only in the retrieved text
      const secondCompletion = await openai.chat.completions.create({
        model: "gpt-3.5-turbo",
        messages: [
          { role: "system", content: "You help users find information in documents. Answer the question using ONLY the provided document content. If the answer cannot be found in the content, say so clearly." },
          { role: "user", content: question },
          { role: "system", content: `Document content: ${documentContent}` }
        ]
      });
      const answer = secondCompletion.choices[0].message.content;
      res.json({
        answer,
        documentId
      });
    } else {
      // OpenAI didn't call the function, just return its response
      res.json({
        answer: responseMessage.content,
        documentId
      });
    }
  } catch (error) {
    console.error('Error processing query:', error);
    res.status(500).json({ error: error.message });
  }
});
// Serve a simple frontend: any file in ./public is served at the site root
app.use(express.static('public'))
// Generate the public directory and a basic HTML interface at startup.
// NOTE(review): this rewrites public/index.html on every launch, clobbering
// any manual edits — fine for a tutorial, not for production.
fs.mkdirSync('public', { recursive: true });
fs.writeFileSync('public/index.html', `
<!DOCTYPE html>
<html>
<head>
<title>Document Q&A System</title>
<style>
body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
.container { display: flex; gap: 20px; }
.left, .right { flex: 1; }
form { margin-bottom: 20px; border: 1px solid #ccc; padding: 15px; border-radius: 4px; }
input, textarea, button { width: 100%; margin-bottom: 10px; padding: 8px; }
button { background-color: #4CAF50; color: white; border: none; cursor: pointer; }
#answer { white-space: pre-wrap; border: 1px solid #ddd; padding: 15px; min-height: 100px; }
</style>
</head>
<body>
<h1>Document Q&A System</h1>
<div class="container">
<div class="left">
<h2>Upload Document</h2>
<form id="uploadForm">
<input type="file" id="document" name="document" required>
<button type="submit">Upload</button>
</form>
<div id="uploadResult"></div>
</div>
<div class="right">
<h2>Ask Questions</h2>
<form id="questionForm">
<input type="text" id="documentId" placeholder="Document ID" required>
<textarea id="question" placeholder="Ask a question about the document" required></textarea>
<button type="submit">Submit Question</button>
</form>
<h3>Answer:</h3>
<div id="answer"></div>
</div>
</div>
<script>
// Upload document
document.getElementById('uploadForm').addEventListener('submit', async (e) => {
e.preventDefault();
const formData = new FormData();
const fileInput = document.getElementById('document');
formData.append('document', fileInput.files[0]);
try {
const response = await fetch('/documents/upload', {
method: 'POST',
body: formData
});
const result = await response.json();
document.getElementById('uploadResult').textContent =
`Document uploaded! Document ID: ${result.documentId}`;
document.getElementById('documentId').value = result.documentId;
} catch (error) {
document.getElementById('uploadResult').textContent =
`Error: ${error.message}`;
}
});
// Ask question
document.getElementById('questionForm').addEventListener('submit', async (e) => {
e.preventDefault();
const documentId = document.getElementById('documentId').value;
const question = document.getElementById('question').value;
try {
const response = await fetch('/documents/query', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({ documentId, question })
});
const result = await response.json();
document.getElementById('answer').textContent = result.answer;
} catch (error) {
document.getElementById('answer').textContent = `Error: ${error.message}`;
}
});
</script>
</body>
</html>
`);
// Start the HTTP server; the port can be overridden via the PORT env var.
const PORT = process.env.PORT || 3000;
const onListening = () => {
  console.log(`Server running on http://localhost:${PORT}`);
};
app.listen(PORT, onListening);
3. Update package.json
Update your package.json to include:
{
"name": "document-qa",
"version": "1.0.0",
"description": "Document Q&A system with Papr Memory and OpenAI",
"main": "app.js",
"type": "module",
"scripts": {
"start": "node app.js"
},
"dependencies": {
"@papr/memory": "^1.0.0",
"dotenv": "^16.3.1",
"express": "^4.18.2",
"multer": "^1.4.5-lts.1",
"openai": "^4.0.0"
}
}
Usage
- Start the server:
npm start
- Open your browser to http://localhost:3000
- Upload a document using the form on the left
- Copy the Document ID into the form on the right
- Ask questions about the document
How It Works
Document Processing Flow:
- User uploads a document
- System extracts text and chunks the document into smaller pieces
- Each chunk is stored in Papr Memory with metadata using the batch API
- Document ID is returned to the user
Question Answering Flow:
- User asks a question about a specific document
- OpenAI determines if document search is needed
- System searches Papr Memory for relevant content using the document_id
- OpenAI receives the document content and generates an answer
- Answer is returned to the user
Function Calling Integration:
- OpenAI uses function calling to trigger document searches
- The search function retrieves relevant document chunks
- OpenAI generates answers based on the retrieved content
Next Steps
- Improve document text extraction (use specialized libraries for different formats)
- Enhance document chunking (semantic paragraphs)
- Add document categorization and tagging
- Implement user authentication
For more information, check out the Papr Memory API documentation and explore other tutorials in our documentation.