Upload and ingest a file as a document. Supports a variety of file types, including PDFs, Word documents, presentations, and more. The file is processed, chunked, and indexed for semantic search.
Parameters:
file: File to ingest (path string, bytes, file object, or Path)
filename: Name of the file
content_type: MIME type (optional, will be guessed if not provided)
metadata: Optional dictionary of metadata
rules: Optional list of processing rules to apply to the extracted text
Returns: Document object with storage information including:
All fields from text documents
storage_info: Contains bucket and key information for file storage
filename: Original filename
content_type: MIME type of the file
from databridge import DataBridge, MetadataExtractionRule, NaturalLanguageRule
# Connect to DataBridge; replace the placeholder URI with your own.
db = DataBridge(uri="your-databridge-uri")

# Processing rules are optional; they are applied to the text extracted
# from the ingested file.
# Natural-language rule: the prompt describes how PII should be redacted.
pii_rule = NaturalLanguageRule(
    prompt="Remove all PII. Replace names with [NAME], emails with [EMAIL]"
)
# Metadata-extraction rule: the schema names the fields to extract
# (looks like a JSON-Schema-style object description -- confirm with SDK docs).
classify_rule = MetadataExtractionRule(
    schema={
        "type": "object",
        "properties": {
            "document_type": {"type": "string"},
            "confidentiality": {"type": "string"},
        },
    }
)
# Ingest straight from a path on disk, attaching custom metadata and
# running both processing rules.
doc = db.ingest_file(
    file="presentation.pdf",
    filename="Q4_Presentation.pdf",
    content_type="application/pdf",
    metadata={"department": "Finance", "year": 2024, "quarter": 4},
    rules=[pii_rule, classify_rule],  # rules may be omitted entirely
)

# The returned document exposes its ID and where the file was stored.
print(f"Document ID: {doc.external_id}")
print(f"Storage location: {doc.storage_info['bucket']}/{doc.storage_info['key']}")
# Ingest from an already-open file object. The file is opened in binary
# mode, and the `with` block closes it once ingest_file() has returned.
# content_type is omitted here; per the parameter notes it is optional and
# will be guessed when not provided.
with open("presentation.pptx", "rb") as f:
    doc = db.ingest_file(
        file=f,
        filename="presentation.pptx",
        rules=[pii_rule],  # Rules work with file objects too
    )