feat: Removed Hard Dependency on Unstructured.io

- Added Llamaparse Support :)
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-05-30 19:17:19 -07:00
parent 5737ea80c0
commit 73751c0eb1
11 changed files with 402 additions and 84 deletions

View file

@ -1,2 +1,3 @@
NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE
NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD

View file

@ -42,38 +42,95 @@ export default function FileUploader() {
const router = useRouter();
const fileInputRef = useRef<HTMLInputElement>(null);
const acceptedFileTypes = {
'image/bmp': ['.bmp'],
'text/csv': ['.csv'],
'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
'message/rfc822': ['.eml'],
'application/epub+zip': ['.epub'],
'image/heic': ['.heic'],
'text/html': ['.html'],
'image/jpeg': ['.jpeg', '.jpg'],
'image/png': ['.png'],
'text/markdown': ['.md', '.markdown'],
'application/vnd.ms-outlook': ['.msg'],
'application/vnd.oasis.opendocument.text': ['.odt'],
'text/x-org': ['.org'],
'application/pkcs7-signature': ['.p7s'],
'application/pdf': ['.pdf'],
'application/vnd.ms-powerpoint': ['.ppt'],
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
'text/x-rst': ['.rst'],
'application/rtf': ['.rtf'],
'image/tiff': ['.tiff'],
'text/plain': ['.txt'],
'text/tab-separated-values': ['.tsv'],
'application/vnd.ms-excel': ['.xls'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
'application/xml': ['.xml'],
// Audio files are always supported (using whisper)
const audioFileTypes = {
'audio/mpeg': ['.mp3', '.mpeg', '.mpga'],
'audio/mp4': ['.mp4', '.m4a'],
'audio/wav': ['.wav'],
'audio/webm': ['.webm'],
}
};
// Conditionally set accepted file types based on ETL service
const acceptedFileTypes = process.env.NEXT_PUBLIC_ETL_SERVICE === 'LLAMACLOUD'
? {
// LlamaCloud supported file types
'application/pdf': ['.pdf'],
'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
'application/vnd.ms-word.document.macroEnabled.12': ['.docm'],
'application/msword-template': ['.dot'],
'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'],
'application/vnd.ms-powerpoint': ['.ppt'],
'application/vnd.ms-powerpoint.template.macroEnabled.12': ['.pptm'],
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
'application/vnd.ms-powerpoint.template': ['.pot'],
'application/vnd.openxmlformats-officedocument.presentationml.template': ['.potx'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
'application/vnd.ms-excel': ['.xls'],
'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'],
'application/vnd.ms-excel.sheet.binary.macroEnabled.12': ['.xlsb'],
'application/vnd.ms-excel.workspace': ['.xlw'],
'application/rtf': ['.rtf'],
'application/xml': ['.xml'],
'application/epub+zip': ['.epub'],
'application/vnd.apple.keynote': ['.key'],
'application/vnd.apple.pages': ['.pages'],
'application/vnd.apple.numbers': ['.numbers'],
'application/vnd.wordperfect': ['.wpd'],
'application/vnd.oasis.opendocument.text': ['.odt'],
'application/vnd.oasis.opendocument.presentation': ['.odp'],
'application/vnd.oasis.opendocument.graphics': ['.odg'],
'application/vnd.oasis.opendocument.spreadsheet': ['.ods'],
'application/vnd.oasis.opendocument.formula': ['.fods'],
'text/plain': ['.txt'],
'text/csv': ['.csv'],
'text/tab-separated-values': ['.tsv'],
'text/html': ['.html', '.htm', '.web'],
'image/jpeg': ['.jpg', '.jpeg'],
'image/png': ['.png'],
'image/gif': ['.gif'],
'image/bmp': ['.bmp'],
'image/svg+xml': ['.svg'],
'image/tiff': ['.tiff'],
'image/webp': ['.webp'],
'application/dbase': ['.dbf'],
'application/vnd.lotus-1-2-3': ['.123'],
'text/x-web-markdown': ['.602', '.abw', '.cgm', '.cwk', '.hwp', '.lwp', '.mw', '.mcw', '.pbd', '.sda', '.sdd', '.sdp', '.sdw', '.sgl', '.sti', '.sxi', '.sxw', '.stw', '.sxg', '.uof', '.uop', '.uot', '.vor', '.wps', '.zabw'],
'text/x-spreadsheet': ['.dif', '.sylk', '.slk', '.prn', '.et', '.uos1', '.uos2', '.wk1', '.wk2', '.wk3', '.wk4', '.wks', '.wq1', '.wq2', '.wb1', '.wb2', '.wb3', '.qpw', '.xlr', '.eth'],
// Audio files (always supported)
...audioFileTypes,
}
: {
// Unstructured supported file types
'image/bmp': ['.bmp'],
'text/csv': ['.csv'],
'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
'message/rfc822': ['.eml'],
'application/epub+zip': ['.epub'],
'image/heic': ['.heic'],
'text/html': ['.html'],
'image/jpeg': ['.jpeg', '.jpg'],
'image/png': ['.png'],
'text/markdown': ['.md', '.markdown'],
'application/vnd.ms-outlook': ['.msg'],
'application/vnd.oasis.opendocument.text': ['.odt'],
'text/x-org': ['.org'],
'application/pkcs7-signature': ['.p7s'],
'application/pdf': ['.pdf'],
'application/vnd.ms-powerpoint': ['.ppt'],
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
'text/x-rst': ['.rst'],
'application/rtf': ['.rtf'],
'image/tiff': ['.tiff'],
'text/plain': ['.txt'],
'text/tab-separated-values': ['.tsv'],
'application/vnd.ms-excel': ['.xls'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
'application/xml': ['.xml'],
// Audio files (always supported)
...audioFileTypes,
};
const supportedExtensions = Array.from(new Set(Object.values(acceptedFileTypes).flat())).sort()

View file

@ -90,7 +90,9 @@ Before you begin, ensure you have:
| FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) |
| STRATEGIC_LLM | LiteLLM routed advanced LLM for complex tasks (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) |
| LONG_CONTEXT_LLM | LiteLLM routed LLM for longer context windows (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) |
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing |
| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats) or `LLAMACLOUD` (supports 50+ formats including legacy document types) |
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) |
| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) |
| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling |
| TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) |
| STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) |
@ -136,6 +138,7 @@ For other LLM providers, refer to the [LiteLLM documentation](https://docs.litel
| ------------------------------- | ---------------------------------------------------------- |
| NEXT_PUBLIC_FASTAPI_BACKEND_URL | URL of the backend service (e.g., `http://localhost:8000`) |
| NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE | Same value as set in backend AUTH_TYPE i.e `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication |
| NEXT_PUBLIC_ETL_SERVICE | Document parsing service (should match backend ETL_SERVICE): `UNSTRUCTURED` or `LLAMACLOUD` - affects supported file formats in upload interface |
2. **Build and Start Containers**

View file

@ -61,7 +61,9 @@ Edit the `.env` file and set the following variables:
| FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) |
| STRATEGIC_LLM | LiteLLM routed advanced LLM for complex tasks (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) |
| LONG_CONTEXT_LLM | LiteLLM routed LLM for longer context windows (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) |
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing |
| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats) or `LLAMACLOUD` (supports 50+ formats including legacy document types) |
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) |
| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) |
| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling |
| TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) |
| STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) |
@ -182,6 +184,7 @@ Edit the `.env` file and set:
| ------------------------------- | ------------------------------------------- |
| NEXT_PUBLIC_FASTAPI_BACKEND_URL | Backend URL (e.g., `http://localhost:8000`) |
| NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE | Same value as set in backend AUTH_TYPE i.e `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication |
| NEXT_PUBLIC_ETL_SERVICE | Document parsing service (should match backend ETL_SERVICE): `UNSTRUCTURED` or `LLAMACLOUD` - affects supported file formats in upload interface |
### 2. Install Dependencies