|
|
|
<!DOCTYPE html> |
|
|
|
|
|
<html lang="zh-CN" data-content_root="../" > |
|
|
|
<head> |
|
<meta charset="utf-8" /> |
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
|
|
|
<title>文档内容提取项目 — PDF-Extract-Kit 0.1.0 文档</title> |
|
|
|
|
|
|
|
<script data-cfasync="false"> |
|
document.documentElement.dataset.mode = localStorage.getItem("mode") || ""; |
|
document.documentElement.dataset.theme = localStorage.getItem("theme") || ""; |
|
</script> |
|
|
|
|
|
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" /> |
|
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" /> |
|
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" /> |
|
|
|
|
|
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" /> |
|
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" /> |
|
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" /> |
|
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" /> |
|
|
|
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=a746c00c" /> |
|
<link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=a3416100" /> |
|
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" /> |
|
|
|
|
|
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" /> |
|
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" /> |
|
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script> |
|
|
|
<script src="../_static/documentation_options.js?v=2693749b"></script> |
|
<script src="../_static/doctools.js?v=9a2dae69"></script> |
|
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script> |
|
<script src="../_static/clipboard.min.js?v=a7894cd8"></script> |
|
<script src="../_static/copybutton.js?v=a5fa425f"></script> |
|
<script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script> |
|
<script src="../_static/translations.js?v=beaddf03"></script> |
|
<script>DOCUMENTATION_OPTIONS.pagename = 'project/pdf_extract';</script> |
|
<link rel="index" title="索引" href="../genindex.html" /> |
|
<link rel="search" title="搜索" href="../search.html" /> |
|
<link rel="next" title="文档翻译项目" href="doc_translate.html" /> |
|
<link rel="prev" title="PDF内容提取评测【端到端】" href="../evaluation/pdf_extract.html" /> |
|
<meta name="viewport" content="width=device-width, initial-scale=1"/> |
|
<meta name="docsearch:language" content="zh-CN"/> |
|
</head> |
|
|
|
|
|
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
|
|
|
|
|
|
|
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div> |
|
|
|
<div id="pst-scroll-pixel-helper"></div> |
|
|
|
<button type="button" class="btn rounded-pill" id="pst-back-to-top"> |
|
<i class="fa-solid fa-arrow-up"></i>Back to top</button> |
|
|
|
|
|
<input type="checkbox" |
|
class="sidebar-toggle" |
|
id="pst-primary-sidebar-checkbox"/> |
|
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label> |
|
|
|
<input type="checkbox" |
|
class="sidebar-toggle" |
|
id="pst-secondary-sidebar-checkbox"/> |
|
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label> |
|
|
|
<div class="search-button__wrapper"> |
|
<div class="search-button__overlay"></div> |
|
<div class="search-button__search-container"> |
|
<form class="bd-search d-flex align-items-center" |
|
action="../search.html" |
|
method="get"> |
|
<i class="fa-solid fa-magnifying-glass"></i> |
|
<input type="search" |
|
class="form-control" |
|
name="q" |
|
id="search-input" |
|
placeholder="Search..." |
|
aria-label="Search..." |
|
autocomplete="off" |
|
autocorrect="off" |
|
autocapitalize="off" |
|
spellcheck="false"/> |
|
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
|
</form></div> |
|
</div> |
|
|
|
<div class="pst-async-banner-revealer d-none"> |
|
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside> |
|
</div> |
|
|
|
|
|
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none"> |
|
</header> |
|
|
|
|
|
<div class="bd-container"> |
|
<div class="bd-container__inner bd-page-width"> |
|
|
|
|
|
|
|
<div class="bd-sidebar-primary bd-sidebar"> |
|
|
|
|
|
|
|
<div class="sidebar-header-items sidebar-primary__section"> |
|
|
|
|
|
|
|
|
|
</div> |
|
|
|
<div class="sidebar-primary-items__start sidebar-primary__section"> |
|
<div class="sidebar-primary-item"> |
|
|
|
|
|
|
|
|
|
|
|
<a class="navbar-brand logo" href="../index.html"> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<img src="../_static/logo.png" class="logo__image only-light" alt="PDF-Extract-Kit 0.1.0 文档 - Home"/> |
|
<script>document.write(`<img src="../_static/logo.png" class="logo__image only-dark" alt="PDF-Extract-Kit 0.1.0 文档 - Home"/>`);</script> |
|
|
|
|
|
</a></div> |
|
<div class="sidebar-primary-item"> |
|
|
|
<script> |
|
document.write(` |
|
<button class="btn search-button-field search-button__button" title="搜索" aria-label="搜索" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
|
<i class="fa-solid fa-magnifying-glass"></i> |
|
<span class="search-button__default-text">搜索</span> |
|
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span> |
|
</button> |
|
`); |
|
</script></div> |
|
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main"> |
|
<div class="bd-toc-item navbar-nav active"> |
|
<p aria-level="2" class="caption" role="heading"><span class="caption-text">快速上手</span></p> |
|
<ul class="nav bd-sidenav"> |
|
<li class="toctree-l1"><a class="reference internal" href="../get_started/installation.html">安装</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../get_started/pretrained_model.html">模型权重下载</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../get_started/quickstart.html">快速开始</a></li> |
|
</ul> |
|
<p aria-level="2" class="caption" role="heading"><span class="caption-text">基础算法模块</span></p> |
|
<ul class="nav bd-sidenav"> |
|
<li class="toctree-l1"><a class="reference internal" href="../algorithm/layout_detection.html">布局检测算法</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../algorithm/formula_detection.html">公式检测算法</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../algorithm/formula_recognition.html">公式识别算法</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../algorithm/ocr.html">光学字符识别(OCR)算法</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../algorithm/table_recognition.html">表格识别算法</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../algorithm/reading_order.html">阅读顺序算法</a></li> |
|
</ul> |
|
<p aria-level="2" class="caption" role="heading"><span class="caption-text">新任务拓展</span></p> |
|
<ul class="nav bd-sidenav"> |
|
<li class="toctree-l1"><a class="reference internal" href="../task_extend/code.html">代码实现</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../task_extend/doc.html">文档补充</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../task_extend/evaluation.html">模型评测</a></li> |
|
</ul> |
|
<p aria-level="2" class="caption" role="heading"><span class="caption-text">支持的模型列表</span></p> |
|
<ul class="nav bd-sidenav"> |
|
<li class="toctree-l1"><a class="reference internal" href="../models/supported.html">已支持的模型</a></li> |
|
</ul> |
|
<p aria-level="2" class="caption" role="heading"><span class="caption-text">模型性能评测</span></p> |
|
<ul class="nav bd-sidenav"> |
|
<li class="toctree-l1"><a class="reference internal" href="../evaluation/layout_detection.html">布局检测算法评测</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../evaluation/formula_detection.html">公式检测算法评测</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../evaluation/formula_recognition.html">公式识别算法评测</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../evaluation/ocr.html">OCR算法评测</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../evaluation/table_recognition.html">表格识别算法评测</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../evaluation/reading_order.html">阅读顺序算法评测</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="../evaluation/pdf_extract.html">PDF内容提取评测【端到端】</a></li> |
|
</ul> |
|
<p aria-level="2" class="caption" role="heading"><span class="caption-text">PDF项目</span></p> |
|
<ul class="current nav bd-sidenav"> |
|
<li class="toctree-l1 current active"><a class="current reference internal" href="#">文档内容提取项目</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="doc_translate.html">文档翻译项目</a></li> |
|
<li class="toctree-l1"><a class="reference internal" href="speed_up.html">模型加速项目</a></li> |
|
</ul> |
|
|
|
</div> |
|
</nav></div> |
|
</div> |
|
|
|
|
|
<div class="sidebar-primary-items__end sidebar-primary__section"> |
|
</div> |
|
|
|
<div id="rtd-footer-container"></div> |
|
|
|
|
|
</div> |
|
|
|
<main id="main-content" class="bd-main" role="main"> |
|
|
|
|
|
|
|
<div class="sbt-scroll-pixel-helper"></div> |
|
|
|
<div class="bd-content"> |
|
<div class="bd-article-container"> |
|
|
|
<div class="bd-header-article d-print-none"> |
|
<div class="header-article-items header-article__inner"> |
|
|
|
<div class="header-article-items__start"> |
|
|
|
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
|
<span class="fa-solid fa-bars"></span> |
|
</button></div> |
|
|
|
</div> |
|
|
|
|
|
<div class="header-article-items__end"> |
|
|
|
<div class="header-article-item"> |
|
|
|
<div class="article-header-buttons"> |
|
|
|
|
|
<a href="https://github.com/opendatalab/PDF-Extract-Kit" target="_blank" |
|
class="btn btn-sm btn-source-repository-button" |
|
title="源码库" |
|
data-bs-placement="bottom" data-bs-toggle="tooltip" |
|
> |
|
|
|
|
|
<span class="btn__icon-container"> |
|
<i class="fab fa-github"></i> |
|
</span> |
|
|
|
</a> |
|
|
|
|
|
|
|
|
|
|
|
|
|
<div class="dropdown dropdown-download-buttons"> |
|
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="下载此页面"> |
|
<i class="fas fa-download"></i> |
|
</button> |
|
<ul class="dropdown-menu"> |
|
|
|
|
|
|
|
<li><a href="../_sources/project/pdf_extract.rst" target="_blank" |
|
class="btn btn-sm btn-download-source-button dropdown-item" |
|
title="下载源文件" |
|
data-bs-placement="left" data-bs-toggle="tooltip" |
|
> |
|
|
|
|
|
<span class="btn__icon-container"> |
|
<i class="fas fa-file"></i> |
|
</span> |
|
<span class="btn__text-container">.rst</span> |
|
</a> |
|
</li> |
|
|
|
|
|
|
|
|
|
<li> |
|
<button onclick="window.print()" |
|
class="btn btn-sm btn-download-pdf-button dropdown-item" |
|
title="打印成 PDF" |
|
data-bs-placement="left" data-bs-toggle="tooltip" |
|
> |
|
|
|
|
|
<span class="btn__icon-container"> |
|
<i class="fas fa-file-pdf"></i> |
|
</span> |
|
<span class="btn__text-container">.pdf</span> |
|
</button> |
|
</li> |
|
|
|
</ul> |
|
</div> |
|
|
|
|
|
|
|
|
|
<button onclick="toggleFullScreen()" |
|
class="btn btn-sm btn-fullscreen-button" |
|
title="全屏模式" |
|
data-bs-placement="bottom" data-bs-toggle="tooltip" |
|
> |
|
|
|
|
|
<span class="btn__icon-container"> |
|
<i class="fas fa-expand"></i> |
|
</span> |
|
|
|
</button> |
|
|
|
|
|
|
|
<script> |
|
document.write(` |
|
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
|
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i> |
|
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i> |
|
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i> |
|
</button> |
|
`); |
|
</script> |
|
|
|
|
|
<script> |
|
document.write(` |
|
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="搜索" aria-label="搜索" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
|
<i class="fa-solid fa-magnifying-glass fa-lg"></i> |
|
</button> |
|
`); |
|
</script> |
|
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
|
<span class="fa-solid fa-list"></span> |
|
</button> |
|
</div></div> |
|
|
|
</div> |
|
|
|
</div> |
|
</div> |
|
|
|
|
|
|
|
<div id="jb-print-docs-body" class="onlyprint"> |
|
<h1>文档内容提取项目</h1> |
|
|
|
<div id="print-main-content"> |
|
<div id="jb-print-toc"> |
|
|
|
<div> |
|
<h2> 目录 </h2> |
|
</div> |
|
<nav aria-label="Page"> |
|
<ul class="visible nav section-nav flex-column"> |
|
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#id2">简介</a></li> |
|
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#id3">项目使用</a><ul class="nav section-nav flex-column"> |
|
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id4">项目配置</a></li> |
|
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id5">多样化输入支持</a></li> |
|
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id6">输出结果</a></li> |
|
</ul> |
|
</li> |
|
</ul> |
|
</nav> |
|
</div> |
|
</div> |
|
</div> |
|
|
|
|
|
|
|
<div id="searchbox"></div> |
|
<article class="bd-article"> |
|
|
|
<section id="id1"> |
|
<h1>文档内容提取项目<a class="headerlink" href="#id1" title="Link to this heading">#</a></h1> |
|
<section id="id2"> |
|
<h2>简介<a class="headerlink" href="#id2" title="Link to this heading">#</a></h2> |
|
<p>文档内容提取是利用布局检测,公式检测,公式识别,OCR等模型,提取文档中的信息,并转换为markdown文本。</p> |
|
</section> |
|
<section id="id3"> |
|
<h2>项目使用<a class="headerlink" href="#id3" title="Link to this heading">#</a></h2> |
|
<p>在配置好环境的情况下,直接执行 <code class="docutils literal notranslate"><span class="pre">project/pdf2markdown/scripts/run_project.py</span></code> 即可运行文档内容提取项目。</p> |
|
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>$<span class="w"> </span>python<span class="w"> </span>project/pdf2markdown/scripts/run_project.py<span class="w"> </span>--config<span class="w"> </span>project/pdf2markdown/configs/pdf2markdown.yaml |
|
</pre></div> |
|
</div> |
|
<section id="id4"> |
|
<h3>项目配置<a class="headerlink" href="#id4" title="Link to this heading">#</a></h3> |
|
<div class="highlight-yaml notranslate"><div class="highlight"><pre><span></span><span class="nt">inputs</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">assets/demo/formula_detection</span> |
|
<span class="nt">outputs</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">outputs/pdf2markdown</span> |
|
<span class="nt">visualize</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">True</span> |
|
<span class="nt">merge2markdown</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">True</span> |
|
<span class="nt">tasks</span><span class="p">:</span> |
|
<span class="w"> </span><span class="nt">layout_detection</span><span class="p">:</span> |
|
<span class="w"> </span><span class="nt">model</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">layout_detection_yolo</span> |
|
<span class="w"> </span><span class="nt">model_config</span><span class="p">:</span> |
|
<span class="w"> </span><span class="nt">img_size</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">1280</span> |
|
<span class="w"> </span><span class="nt">conf_thres</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">0.25</span> |
|
<span class="w"> </span><span class="nt">iou_thres</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">0.45</span> |
|
<span class="w"> </span><span class="nt">batch_size</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">1</span> |
|
<span class="w"> </span><span class="nt">model_path</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">models/Layout/YOLO/yolov10l_ft.pt</span> |
|
<span class="w"> </span><span class="nt">formula_detection</span><span class="p">:</span> |
|
<span class="w"> </span><span class="nt">model</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">formula_detection_yolo</span> |
|
<span class="w"> </span><span class="nt">model_config</span><span class="p">:</span> |
|
<span class="w"> </span><span class="nt">img_size</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">1280</span> |
|
<span class="w"> </span><span class="nt">conf_thres</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">0.25</span> |
|
<span class="w"> </span><span class="nt">iou_thres</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">0.45</span> |
|
<span class="w"> </span><span class="nt">batch_size</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">1</span> |
|
<span class="w"> </span><span class="nt">model_path</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">models/MFD/YOLO/yolo_v8_ft.pt</span> |
|
<span class="w"> </span><span class="nt">formula_recognition</span><span class="p">:</span> |
|
<span class="w"> </span><span class="nt">model</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">formula_recognition_unimernet</span> |
|
<span class="w"> </span><span class="nt">model_config</span><span class="p">:</span> |
|
<span class="w"> </span><span class="nt">batch_size</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">128</span> |
|
<span class="w"> </span><span class="nt">cfg_path</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">pdf_extract_kit/configs/unimernet.yaml</span> |
|
<span class="w"> </span><span class="nt">model_path</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">models/MFR/unimernet_tiny</span> |
|
<span class="w"> </span><span class="nt">ocr</span><span class="p">:</span> |
|
<span class="w"> </span><span class="nt">model</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">ocr_ppocr</span> |
|
<span class="w"> </span><span class="nt">model_config</span><span class="p">:</span> |
|
<span class="w"> </span><span class="nt">lang</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">ch</span> |
|
<span class="w"> </span><span class="nt">show_log</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">True</span> |
|
<span class="w"> </span><span class="nt">det_model_dir</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">models/OCR/PaddleOCR/det/ch_PP-OCRv4_det</span> |
|
<span class="w"> </span><span class="nt">rec_model_dir</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">models/OCR/PaddleOCR/rec/ch_PP-OCRv4_rec</span> |
|
<span class="w"> </span><span class="nt">det_db_box_thresh</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">0.3</span> |
|
</pre></div> |
|
</div> |
|
<ul class="simple"> |
|
<li><p>inputs/outputs: 分别定义输入文件路径和输出路径</p></li> |
|
<li><p>visualize: 是否对模型结果进行可视化,可视化结果会保存在outputs目录下。</p></li> |
|
<li><p>merge2markdown: 是否将结果合并为markdown文档,这里只支持简单的单栏文本从上往下进行拼接,更复杂布局文档的markdown转换请参考 <a class="reference external" href="https://github.com/opendatalab/MinerU">MinerU</a></p></li> |
|
<li><p>tasks: 定义任务类型,PDF文档提取包含了布局检测、公式检测、公式识别、OCR等任务</p></li> |
|
<li><p>具体每个任务和模型的参数含义请参考各任务的教程文档</p></li> |
|
</ul> |
|
</section> |
|
<section id="id5"> |
|
<h3>多样化输入支持<a class="headerlink" href="#id5" title="Link to this heading">#</a></h3> |
|
<p>PDF文档内容提取支持 <code class="docutils literal notranslate"><span class="pre">单个图像/PDF文件</span></code> 、 <code class="docutils literal notranslate"><span class="pre">包含图像/PDF文件的目录</span></code> 等输入形式。</p> |
|
</section> |
|
<section id="id6"> |
|
<h3>输出结果<a class="headerlink" href="#id6" title="Link to this heading">#</a></h3> |
|
<p>PDF文档提取的结果以json形式保存在 <code class="docutils literal notranslate"><span class="pre">outputs</span></code> 路径下,json的格式如下所示:</p> |
|
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">[</span> |
|
<span class="w"> </span><span class="p">{</span> |
|
<span class="w"> </span><span class="nt">"layout_dets"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span> |
|
<span class="w"> </span><span class="p">{</span> |
|
<span class="w"> </span><span class="nt">"category_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"text"</span><span class="p">,</span> |
|
<span class="w"> </span><span class="nt">"poly"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span> |
|
<span class="w"> </span><span class="mf">380.6792698635707</span><span class="p">,</span> |
|
<span class="w"> </span><span class="mf">159.85058512958923</span><span class="p">,</span> |
|
<span class="w"> </span><span class="mf">765.1419999999998</span><span class="p">,</span> |
|
<span class="w"> </span><span class="mf">159.85058512958923</span><span class="p">,</span> |
|
<span class="w"> </span><span class="mf">765.1419999999998</span><span class="p">,</span> |
|
<span class="w"> </span><span class="mf">192.51073013642917</span><span class="p">,</span> |
|
<span class="w"> </span><span class="mf">380.6792698635707</span><span class="p">,</span> |
|
<span class="w"> </span><span class="mf">192.51073013642917</span> |
|
<span class="w"> </span><span class="p">],</span> |
|
<span class="w"> </span><span class="nt">"text"</span><span class="p">:</span><span class="w"> </span><span class="s2">"this is an example text"</span><span class="p">,</span> |
|
<span class="w"> </span><span class="nt">"score"</span><span class="p">:</span><span class="w"> </span><span class="mf">0.97</span> |
|
<span class="w"> </span><span class="p">},</span> |
|
<span class="w"> </span><span class="err">...</span> |
|
<span class="w"> </span><span class="p">],</span> |
|
<span class="w"> </span><span class="nt">"page_info"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span> |
|
<span class="w"> </span><span class="nt">"page_no"</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span> |
|
<span class="w"> </span><span class="nt">"height"</span><span class="p">:</span><span class="w"> </span><span class="mi">2339</span><span class="p">,</span> |
|
<span class="w"> </span><span class="nt">"width"</span><span class="p">:</span><span class="w"> </span><span class="mi">1654</span> |
|
<span class="w"> </span><span class="p">}</span> |
|
<span class="w"> </span><span class="p">},</span> |
|
<span class="w"> </span><span class="err">...</span> |
|
<span class="p">]</span> |
|
</pre></div> |
|
</div> |
|
<ul class="simple"> |
|
<li><p>layout_dets: 单页PDF或图片的内容提取结果</p></li> |
|
<li><p>category_type: 单个内容块的所属类别,比如标题、图片、行内公式等等</p></li> |
|
<li><p>poly: 单个内容块的位置坐标</p></li> |
|
<li><p>text: 该文本块的文本内容</p></li> |
|
<li><p>score: 检测的置信度</p></li> |
|
<li><p>page_info: 页面信息,包含页码和页面尺寸</p></li> |
|
<li><p>page_no: 页码,从0开始计数</p></li> |
|
<li><p>height: 页面尺寸: 高</p></li> |
|
<li><p>width: 页面尺寸: 宽</p></li> |
|
</ul> |
|
<p>如果 <code class="docutils literal notranslate"><span class="pre">merge2markdown</span></code> 参数为True的话,则会额外保存一个markdown文件。</p> |
|
</section> |
|
</section> |
|
</section> |
|
|
|
|
|
</article> |
|
|
|
|
|
|
|
|
|
|
|
|
|
<footer class="prev-next-footer d-print-none"> |
|
|
|
<div class="prev-next-area"> |
|
<a class="left-prev" |
|
href="../evaluation/pdf_extract.html" |
|
title="previous page"> |
|
<i class="fa-solid fa-angle-left"></i> |
|
<div class="prev-next-info"> |
|
<p class="prev-next-subtitle">上一页</p> |
|
<p class="prev-next-title">PDF内容提取评测【端到端】</p> |
|
</div> |
|
</a> |
|
<a class="right-next" |
|
href="doc_translate.html" |
|
title="next page"> |
|
<div class="prev-next-info"> |
|
<p class="prev-next-subtitle">下一页</p> |
|
<p class="prev-next-title">文档翻译项目</p> |
|
</div> |
|
<i class="fa-solid fa-angle-right"></i> |
|
</a> |
|
</div> |
|
</footer> |
|
|
|
</div> |
|
|
|
|
|
|
|
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner"> |
|
|
|
|
|
<div class="sidebar-secondary-item"> |
|
<div class="page-toc tocsection onthispage"> |
|
<i class="fa-solid fa-list"></i> 目录 |
|
</div> |
|
<nav class="bd-toc-nav page-toc"> |
|
<ul class="visible nav section-nav flex-column"> |
|
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#id2">简介</a></li> |
|
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#id3">项目使用</a><ul class="nav section-nav flex-column"> |
|
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id4">项目配置</a></li> |
|
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id5">多样化输入支持</a></li> |
|
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id6">输出结果</a></li> |
|
</ul> |
|
</li> |
|
</ul> |
|
</nav></div> |
|
|
|
</div></div> |
|
|
|
|
|
</div> |
|
<footer class="bd-footer-content"> |
|
|
|
<div class="bd-footer-content__inner container"> |
|
|
|
<div class="footer-item"> |
|
|
|
<p class="component-author"> |
|
作者: PDF-Extract-Kit Contributors |
|
</p> |
|
|
|
</div> |
|
|
|
<div class="footer-item"> |
|
|
|
|
|
<p class="copyright"> |
|
|
|
© Copyright 2024, OpenDataLab. |
|
<br/> |
|
|
|
</p> |
|
|
|
</div> |
|
|
|
<div class="footer-item"> |
|
|
|
</div> |
|
|
|
<div class="footer-item"> |
|
|
|
</div> |
|
|
|
</div> |
|
</footer> |
|
|
|
|
|
</main> |
|
</div> |
|
</div> |
|
|
|
|
|
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script> |
|
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script> |
|
|
|
<footer class="bd-footer"> |
|
</footer> |
|
</body> |
|
</html> |