Building a Legal Document Analysis Agent
This end-to-end tutorial demonstrates how to build a legal document analysis agent that processes contracts, extracts entities, and uses both user memories (contracts) and agent memories (legal reasoning) for intelligent analysis.
What We'll Build
A legal document analysis system that:
- Defines a custom legal schema for contracts, parties, and obligations
- Uploads and processes legal documents with schema-guided extraction
- Queries contract information with natural language
- Analyzes contracts with GraphQL for insights
- Stores the agent's own legal reasoning as memories for future improvement
Prerequisites
- Papr Memory API key (Get one here)
- Python 3.8+ or Node.js 16+
- Sample legal documents (PDFs or Word docs)
Step 1: Define Legal Schema
First, create a schema that defines the entities and relationships in legal contracts.
from papr_memory import Papr
import os
client = Papr(x_api_key=os.environ.get("PAPR_MEMORY_API_KEY"))


def _prop(kind, description, *, required=False, **extras):
    """Build one property definition.

    ``extras`` carries optional keys such as ``enum_values`` and ``default``;
    they are appended after the three standard keys in the order given.
    """
    return {"type": kind, "required": required, "description": description, **extras}


def _node(name, description, properties, *, required, unique, color):
    """Build one node type; every node in this schema uses its name as its label."""
    return {
        "name": name,
        "label": name,
        "description": description,
        "properties": properties,
        "required_properties": required,
        "unique_identifiers": unique,
        "color": color,
    }


def _relationship(name, label, description, sources, targets, cardinality, color,
                  properties=None):
    """Build one relationship type; ``properties`` is included only when given."""
    relationship = {
        "name": name,
        "label": label,
        "description": description,
        "allowed_source_types": sources,
        "allowed_target_types": targets,
    }
    if properties is not None:
        relationship["properties"] = properties
    relationship["cardinality"] = cardinality
    relationship["color"] = color
    return relationship


# --- Node types -------------------------------------------------------------

contract_node = _node(
    "Contract",
    "Legal contract or agreement document",
    {
        "title": _prop(
            "string",
            "Full contract title as it appears in the document",
            required=True,
        ),
        "contract_type": _prop(
            "string",
            "Type of legal contract",
            required=True,
            enum_values=["service", "employment", "nda", "partnership", "license"],
        ),
        "value": _prop("float", "Total contract value in USD"),
        "effective_date": _prop("datetime", "Date when contract becomes effective"),
        "expiration_date": _prop("datetime", "Date when contract expires"),
        "status": _prop(
            "string",
            "Current status of the contract",
            enum_values=["draft", "active", "expired", "terminated"],
            default="active",
        ),
        "jurisdiction": _prop("string", "Legal jurisdiction governing the contract"),
    },
    required=["title", "contract_type"],
    unique=["title"],
    color="#e74c3c",
)

party_node = _node(
    "Party",
    "Legal party to a contract (person or organization)",
    {
        "name": _prop("string", "Full legal name of the party", required=True),
        "party_type": _prop(
            "string",
            "Type of party",
            enum_values=["individual", "company", "government", "nonprofit"],
        ),
        "role": _prop(
            "string",
            "Role in the contract",
            enum_values=["client", "vendor", "employee", "partner", "licensor", "licensee"],
        ),
        "address": _prop("string", "Legal address of the party"),
    },
    required=["name"],
    unique=["name"],
    color="#3498db",
)

obligation_node = _node(
    "Obligation",
    "Contractual obligation or responsibility",
    {
        "description": _prop(
            "string", "Detailed description of the obligation", required=True
        ),
        "deadline": _prop("datetime", "Deadline for fulfilling the obligation"),
        "status": _prop(
            "string",
            "Current status of the obligation",
            enum_values=["pending", "in_progress", "completed", "overdue"],
            default="pending",
        ),
        "penalty": _prop("string", "Penalty for non-compliance"),
    },
    required=["description"],
    unique=[],
    color="#f39c12",
)

term_node = _node(
    "Term",
    "Specific term or condition in the contract",
    {
        "clause_number": _prop("string", "Clause or section number in the contract"),
        "description": _prop(
            "string", "Description of the term or condition", required=True
        ),
        "term_type": _prop(
            "string",
            "Type of contract term",
            enum_values=["payment", "termination", "confidentiality", "liability", "warranty"],
        ),
    },
    required=["description"],
    unique=[],
    color="#9b59b6",
)

# --- Relationship types -----------------------------------------------------

party_to = _relationship(
    "PARTY_TO", "Party To", "Party is a signatory to the contract",
    ["Party"], ["Contract"], "many-to-many", "#2ecc71",
)
has_obligation = _relationship(
    "HAS_OBLIGATION", "Has Obligation", "Contract or party has an obligation",
    ["Contract", "Party"], ["Obligation"], "one-to-many", "#e67e22",
)
has_term = _relationship(
    "HAS_TERM", "Has Term", "Contract includes a specific term",
    ["Contract"], ["Term"], "one-to-many", "#1abc9c",
)
obligated_to = _relationship(
    "OBLIGATED_TO", "Obligated To", "Party has obligation towards another party",
    ["Party"], ["Party"], "many-to-many", "#34495e",
    # This property definition has no description, so it is spelled out
    # literally rather than built with _prop().
    properties={"obligation_description": {"type": "string", "required": False}},
)

# Define legal contract schema
legal_schema = client.schemas.create(
    name="Legal Contract Schema",
    description="Schema for analyzing legal contracts, parties, obligations, and terms",
    version="1.0.0",
    node_types={
        "Contract": contract_node,
        "Party": party_node,
        "Obligation": obligation_node,
        "Term": term_node,
    },
    relationship_types={
        "PARTY_TO": party_to,
        "HAS_OBLIGATION": has_obligation,
        "HAS_TERM": has_term,
        "OBLIGATED_TO": obligated_to,
    },
)
schema_id = legal_schema.data.id
print(f"Legal schema created: {schema_id}")
Step 2: Upload Legal Documents
Now upload legal documents using the schema for guided extraction.
# Upload a service agreement with schema-guided extraction.
# The file is opened in a context manager so the handle is closed as soon as
# the upload call returns, rather than being left open by a bare open().
with open("service_agreement.pdf", "rb") as contract_file:
    doc_response = client.document.upload(
        file=contract_file,
        schema_id=schema_id,
        simple_schema_mode=True,  # Use system + this schema only
        hierarchical_enabled=True,  # Preserve document structure
        # NOTE(review): "department" and "review_required" are not declared on
        # the Contract node type in Step 1 — confirm the API accepts overrides
        # for properties that are not part of the schema.
        property_overrides=[
            {
                "nodeLabel": "Contract",
                "set": {
                    "status": "active",
                    "department": "legal",
                    "review_required": False,
                },
            }
        ],
    )

upload_id = doc_response.document_status.upload_id
print(f"Document uploaded: {upload_id}")
# Monitor processing status
import time
while True:
status = client.document.get_status(upload_id)
print(f"Progress: {status.progress * 100}% - Page {status.current_page}/{status.total_pages}")
if status.status_type in ["completed", "failed", "cancelled"]:
break
time.sleep(5)
if status.status_type == "completed":
print("✓ Document processed successfully!")
print(f" Created {len(status.memory_items)} memories")
else:
print(f"✗ Processing failed: {status.error}")Step 3: Add Agent Legal Reasoning
The agent documents its own legal analysis workflow as a memory, enabling self-improvement.
# Agent documents its legal analysis workflow
workflow_text = """
Legal Contract Analysis Workflow:
1. CONTRACT IDENTIFICATION
- Extract contract title, type, and parties
- Identify effective and expiration dates
- Determine jurisdiction and governing law
2. PARTY ANALYSIS
- Identify all parties and their roles
- Extract contact information and legal addresses
- Determine signing authority
3. FINANCIAL TERMS
- Extract contract value and payment terms
- Identify payment schedule and milestones
- Note any penalties or late fees
4. OBLIGATIONS AND RESPONSIBILITIES
- List each party's obligations
- Extract deadlines and milestones
- Identify deliverables
5. TERMINATION AND LIABILITY
- Extract termination conditions
- Identify liability limitations
- Note dispute resolution mechanisms
6. RED FLAGS
- Unusual or non-standard clauses
- Missing standard protections
- Ambiguous language requiring clarification
Success Rate: 92% accurate extraction
Common Issues: Date format variations, multi-party contracts
"""

workflow_metadata = {
    "role": "assistant",  # This is an agent memory
    "category": "learning",
    "workflow_type": "legal_analysis",
    "topics": ["contracts", "legal", "document_analysis"],
    "success_rate": 0.92,
}

# Graph generation settings passed alongside the memory.
workflow_graph_options = {
    "mode": "auto",
    "auto": {"simple_schema_mode": True},
}

agent_workflow = client.memory.add(
    content=workflow_text,
    metadata=workflow_metadata,
    graph_generation=workflow_graph_options,
)
print("✓ Agent workflow documented")
Step 4: Query with Natural Language
Search for contract information using natural language.
# Find contracts expiring soon
search_response = client.memory.search(
    query="What contracts are expiring in the next 90 days?",
    enable_agentic_graph=True,  # Enable graph-enhanced search
    max_memories=20,
    max_nodes=15,  # Include graph entities
)

print(f"Found {len(search_response.data.memories)} relevant memories")
print(f"Found {len(search_response.data.nodes)} graph entities")

# Print only the Contract nodes; other graph entities are skipped.
for node in search_response.data.nodes:
    if node.label != "Contract":
        continue
    print(f"\nContract: {node.properties.get('title')}")
    print(f" Type: {node.properties.get('contract_type')}")
    print(f" Status: {node.properties.get('status')}")
    print(f" Expires: {node.properties.get('expiration_date')}")
# Find obligations for a specific party
obligations_response = client.memory.search(
query="What are Acme Corp's obligations in our contracts?",
enable_agentic_graph=True,
max_memories=20,
max_nodes=20
)
for node in obligations_response.data.nodes:
if node.label == "Obligation":
print(f"\nObligation: {node.properties.get('description')}")
print(f" Deadline: {node.properties.get('deadline')}")
print(f" Status: {node.properties.get('status')}")Step 5: Analyze with GraphQL
Use GraphQL for structured contract analytics.
# Get contract summary by type
contract_summary = client.graphql.query(
    query="""
query ContractSummary {
contracts {
contract_type
count: aggregate {
count
}
total_value: aggregate {
sum(field: "value")
}
by_status {
status
count: aggregate {
count
}
}
}
}
"""
)

print("\n=== Contract Summary ===")
for contract_type in contract_summary.data['contracts']:
    # Each aliased `aggregate { ... }` selection has a sub-selection, so it
    # comes back as an object keyed by the selected field (the party analysis
    # later in this step reads its aggregates the same way). Index into that
    # object instead of formatting the object itself, which would raise
    # TypeError for the ":,.2f" format spec.
    total_count = contract_type['count']['count']
    # The sum may be null when no contract in the group has a value.
    total_value = contract_type['total_value']['sum'] or 0

    print(f"\n{contract_type['contract_type'].upper()}:")
    print(f" Total: {total_count}")
    print(f" Value: ${total_value:,.2f}")
    for status in contract_type['by_status']:
        print(f" {status['status']}: {status['count']['count']}")
# Find contracts with upcoming obligations
UPCOMING_OBLIGATIONS_QUERY = """
query UpcomingObligations($startDate: DateTime!, $endDate: DateTime!) {
contracts {
title
contract_type
parties {
name
role
}
obligations(
where: {
deadline: { _gte: $startDate, _lte: $endDate }
status: { _in: ["pending", "in_progress"] }
}
) {
description
deadline
status
}
}
}
"""

# Reporting window bound to the query's $startDate / $endDate variables.
date_window = {
    "startDate": "2024-01-01T00:00:00Z",
    "endDate": "2024-03-31T23:59:59Z",
}

upcoming_obligations = client.graphql.query(
    query=UPCOMING_OBLIGATIONS_QUERY,
    variables=date_window,
)

print("\n=== Upcoming Obligations ===")
for contract in upcoming_obligations.data['contracts']:
    # Skip contracts with no obligation inside the window.
    if not contract['obligations']:
        continue
    print(f"\nContract: {contract['title']}")
    for obligation in contract['obligations']:
        print(f" - {obligation['description']}")
        print(f" Deadline: {obligation['deadline']}")
        print(f" Status: {obligation['status']}")
# Analyze party relationships
party_analysis = client.graphql.query(
query="""
query PartyAnalysis {
parties {
name
role
contracts: party_to_aggregate {
count
}
obligations: has_obligation_aggregate {
count
}
total_contract_value: contracts {
value
}
}
}
"""
)
print("\n=== Party Analysis ===")
for party in party_analysis.data['parties']:
print(f"\n{party['name']} ({party['role']})")
print(f" Contracts: {party['contracts']['count']}")
print(f" Obligations: {party['obligations']['count']}")
total_value = sum(c['value'] for c in party['total_contract_value'] if c['value'])
print(f" Total Value: ${total_value:,.2f}")Step 6: Agent Learning and Improvement
The agent can query its own documented workflows and improve based on results.
# Agent retrieves its own workflow documentation
workflow_filter = {"role": "assistant", "workflow_type": "legal_analysis"}

workflow_memories = client.memory.search(
    query="What is my workflow for analyzing legal contracts?",
    metadata_filter=workflow_filter,
    max_memories=5,
)

for memory in workflow_memories.data.memories:
    # Show only the first 200 characters of each workflow.
    preview = memory.content[:200]
    recorded_rate = memory.metadata.get('success_rate', 'N/A')
    print(f"Workflow: {preview}...")
    print(f"Success Rate: {recorded_rate}")
# Agent documents an improvement to the workflow
improvement_text = """
WORKFLOW IMPROVEMENT (v1.1):
Added step 2.5: BENEFICIAL OWNERSHIP ANALYSIS
- Identify ultimate beneficial owners
- Track ownership percentages
- Flag complex ownership structures
Rationale: 15% of contracts had unclear beneficial ownership,
causing delays in compliance checks.
Result: Reduced compliance review time by 25%
Updated Success Rate: 94% (up from 92%)
"""

improvement_metadata = {
    "role": "assistant",
    "category": "learning",
    "workflow_type": "legal_analysis",
    "workflow_version": "1.1",
    "improvement_type": "process_addition",
    "success_rate": 0.94,
    # Links this entry back to the workflow memory created in Step 3.
    "previous_version": agent_workflow.data.id,
}

improved_workflow = client.memory.add(
    content=improvement_text,
    metadata=improvement_metadata,
    graph_generation={
        "mode": "auto",
        "auto": {"simple_schema_mode": True},
    },
)

print("✓ Agent documented workflow improvement")
print(f" Success rate improved from 92% to 94%")
Complete Example Application
Here's a complete application that ties everything together:
class LegalDocumentAnalyzer:
    """Bundle the tutorial's upload, search, GraphQL and memory calls in one object.

    Attributes:
        client: Papr client created from the API key given to ``__init__``.
        schema_id: Schema ID sent with every document upload.
    """

    def __init__(self, api_key: str, schema_id: str):
        """Create the Papr client and remember which schema to use for uploads."""
        self.client = Papr(x_api_key=api_key)
        self.schema_id = schema_id

    def upload_contract(self, file_path: str) -> str:
        """Upload a contract file for schema-guided processing.

        Args:
            file_path: Path of the document; it is opened in binary mode.

        Returns:
            The ``upload_id`` found on the upload response's document status.
        """
        # The context manager closes the file once the upload call returns.
        with open(file_path, "rb") as f:
            response = self.client.document.upload(
                file=f,
                schema_id=self.schema_id,
                simple_schema_mode=True,
                hierarchical_enabled=True,
            )
        return response.document_status.upload_id

    def get_expiring_contracts(self, days: int = 90):
        """Search for contracts expiring within ``days`` days.

        The time window is expressed only in the natural-language query; this
        method itself filters the returned graph nodes by label, not by date.

        Returns:
            The nodes from the search response whose label is ``"Contract"``.
        """
        response = self.client.memory.search(
            query=f"What contracts are expiring in the next {days} days?",
            enable_agentic_graph=True,
            max_nodes=20,
        )
        return [node for node in response.data.nodes if node.label == "Contract"]

    def analyze_party_obligations(self, party_name: str):
        """Run the ``PartyObligations`` GraphQL query for one party.

        Args:
            party_name: Bound to the query's ``$partyName`` variable.

        Returns:
            The ``data`` attribute of the GraphQL response.
        """
        response = self.client.graphql.query(
            query="""
query PartyObligations($partyName: String!) {
party(name: $partyName) {
name
role
obligations {
description
deadline
status
}
contracts {
title
contract_type
status
}
}
}
""",
            variables={"partyName": party_name},
        )
        return response.data

    def document_analysis_result(self, contract_id: str, findings: dict) -> None:
        """Store an analysis result as an agent memory.

        Args:
            contract_id: Identifier recorded in both the content and the metadata.
            findings: Extra key/value pairs; rendered into the content text and
                merged into the metadata.
        """
        content = (
            "\n"
            "Contract Analysis Result:\n"
            f"Contract ID: {contract_id}\n"
            f"Findings: {findings}\n"
        )
        # Spread findings first so a findings key named "role", "category" or
        # "contract_id" cannot overwrite the reserved fields that mark this
        # entry as an agent analysis result.
        metadata = {
            **findings,
            "role": "assistant",
            "category": "analysis_result",
            "contract_id": contract_id,
        }
        self.client.memory.add(content=content, metadata=metadata)
# Usage
papr_api_key = os.environ.get("PAPR_MEMORY_API_KEY")
analyzer = LegalDocumentAnalyzer(api_key=papr_api_key, schema_id=schema_id)

# Upload contracts
upload_id = analyzer.upload_contract("service_agreement.pdf")

# Find expiring contracts
expiring = analyzer.get_expiring_contracts(days=90)
expiring_total = len(expiring)
print(f"Found {expiring_total} contracts expiring soon")

# Analyze party obligations
obligations = analyzer.analyze_party_obligations("Acme Corp")
print(f"Acme Corp has {len(obligations['party']['obligations'])} obligations")
Key Takeaways
Dual Memory Architecture
- User memories: Contracts, parties, obligations extracted from documents
- Agent memories: Legal analysis workflows, learnings, improvements
Three Input Paths (Two Demonstrated)
- Documents: Uploaded legal contracts with schema-guided extraction
- Direct Memory: Agent's workflow documentation
- Messages: Could add conversational context (not shown in tutorial)
Two Query Modes Used
- Natural Language Search: "What contracts expire soon?"
- GraphQL: Structured analytics and aggregations
Agent Self-Improvement
The agent documents its own workflows and improvements, building institutional knowledge that persists across sessions.
Next Steps
- Document Processing Guide - Deep dive into document upload
- Custom Schemas Guide - Advanced schema design
- GraphQL Analysis Guide - Complex queries
- Knowledge Graphs Guide - Graph concepts
- Multi-tenant Guide - Isolate client data