mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-09 19:45:15 +02:00
Preserve get_page_tokens parser option
This commit is contained in:
parent
f438ab8dd7
commit
966a6be4d6
1 changed file with 6 additions and 7 deletions
|
|
@@ -391,9 +391,9 @@ SUPPORTED_PDF_PARSERS = ("PyPDF2", "pypdfium2", "PyMuPDF")
|
|||
DEFAULT_PDF_PARSER = os.getenv("PAGEINDEX_PDF_PARSER") or SUPPORTED_PDF_PARSERS[0]
|
||||
|
||||
|
||||
def read_pdf_pages(doc):
|
||||
"""Return a list of per-page text strings using the currently configured parser."""
|
||||
parser = DEFAULT_PDF_PARSER
|
||||
def read_pdf_pages(doc, pdf_parser=None):
|
||||
"""Return a list of per-page text strings using the selected parser."""
|
||||
parser = pdf_parser or DEFAULT_PDF_PARSER
|
||||
|
||||
if parser == "PyPDF2":
|
||||
reader = PyPDF2.PdfReader(doc)
|
||||
|
|
@@ -435,15 +435,14 @@ def read_pdf_pages(doc):
|
|||
d.close()
|
||||
|
||||
raise ValueError(
|
||||
f"Unsupported DEFAULT_PDF_PARSER={parser!r}. Choose from {SUPPORTED_PDF_PARSERS}."
|
||||
f"Unsupported pdf_parser={parser!r}. Choose from {SUPPORTED_PDF_PARSERS}."
|
||||
)
|
||||
|
||||
|
||||
def get_page_tokens(pdf_path, model=None):
|
||||
pages = read_pdf_pages(pdf_path)
|
||||
def get_page_tokens(pdf_path, model=None, pdf_parser=None):
|
||||
pages = read_pdf_pages(pdf_path, pdf_parser=pdf_parser)
|
||||
return [(text, litellm.token_counter(model=model, text=text)) for text in pages]
|
||||
|
||||
|
||||
|
||||
def get_text_of_pdf_pages(pdf_pages, start_page, end_page):
|
||||
text = ""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue