Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +192 -0
- README.md +3 -9
- file_cleaning.py +107 -0
- file_cleaning_ui.py +127 -0
- gr +0 -0
- os +0 -0
- pdfplumber +0 -0
- re +0 -0
- requirements.txt +1 -0
- venv/bin/Activate.ps1 +241 -0
- venv/bin/__pycache__/dumppdf.cpython-39.pyc +0 -0
- venv/bin/__pycache__/pdf2txt.cpython-39.pyc +0 -0
- venv/bin/activate +66 -0
- venv/bin/activate.csh +25 -0
- venv/bin/activate.fish +64 -0
- venv/bin/chardetect +8 -0
- venv/bin/dumppdf.py +480 -0
- venv/bin/f2py +8 -0
- venv/bin/fastapi +8 -0
- venv/bin/fonttools +8 -0
- venv/bin/gradio +8 -0
- venv/bin/hf +8 -0
- venv/bin/httpx +8 -0
- venv/bin/huggingface-cli +8 -0
- venv/bin/markdown-it +8 -0
- venv/bin/markdownify +8 -0
- venv/bin/normalizer +8 -0
- venv/bin/numpy-config +8 -0
- venv/bin/pdf2txt.py +323 -0
- venv/bin/pdfplumber +8 -0
- venv/bin/pip +8 -0
- venv/bin/pip3 +8 -0
- venv/bin/pip3.9 +8 -0
- venv/bin/pyftmerge +8 -0
- venv/bin/pyftsubset +8 -0
- venv/bin/pygmentize +8 -0
- venv/bin/pypdfium2 +8 -0
- venv/bin/python +3 -0
- venv/bin/python3 +3 -0
- venv/bin/python3.9 +3 -0
- venv/bin/ruff +3 -0
- venv/bin/tiny-agents +8 -0
- venv/bin/tqdm +8 -0
- venv/bin/ttx +8 -0
- venv/bin/typer +8 -0
- venv/bin/upload_theme +8 -0
- venv/bin/uvicorn +8 -0
- venv/lib/python3.9/site-packages/MarkupSafe-2.1.5.dist-info/INSTALLER +1 -0
- venv/lib/python3.9/site-packages/MarkupSafe-2.1.5.dist-info/LICENSE.rst +28 -0
- venv/lib/python3.9/site-packages/MarkupSafe-2.1.5.dist-info/METADATA +93 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,195 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
venv/bin/python filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
venv/bin/python3 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
venv/bin/python3.9 filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
venv/bin/ruff filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
venv/lib/python3.9/site-packages/PIL/__pycache__/Image.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
venv/lib/python3.9/site-packages/PIL/_imaging.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
venv/lib/python3.9/site-packages/PIL/_imagingcms.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
venv/lib/python3.9/site-packages/PIL/_imagingft.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
venv/lib/python3.9/site-packages/PIL/_imagingmath.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
venv/lib/python3.9/site-packages/__pycache__/typing_extensions.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
venv/lib/python3.9/site-packages/_cffi_backend.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
venv/lib/python3.9/site-packages/charset_normalizer/md__mypyc.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
venv/lib/python3.9/site-packages/contourpy/_contourpy.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
venv/lib/python3.9/site-packages/cryptography/hazmat/bindings/_rust.abi3.so filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
venv/lib/python3.9/site-packages/fontTools/__pycache__/agl.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
venv/lib/python3.9/site-packages/fontTools/cu2qu/cu2qu.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
venv/lib/python3.9/site-packages/fontTools/feaLib/lexer.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
venv/lib/python3.9/site-packages/fontTools/misc/bezierTools.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
venv/lib/python3.9/site-packages/fontTools/otlLib/__pycache__/builder.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
venv/lib/python3.9/site-packages/fontTools/pens/momentsPen.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
venv/lib/python3.9/site-packages/fontTools/qu2cu/qu2cu.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
venv/lib/python3.9/site-packages/fontTools/subset/__pycache__/__init__.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
venv/lib/python3.9/site-packages/fontTools/varLib/iup.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
venv/lib/python3.9/site-packages/gradio/_frontend_code/component-test/src/lib/images/Duck.glb filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
venv/lib/python3.9/site-packages/gradio/_frontend_code/component-test/src/lib/images/base.webp filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
venv/lib/python3.9/site-packages/gradio/_frontend_code/component-test/src/lib/images/cantina.wav filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
venv/lib/python3.9/site-packages/gradio/_frontend_code/component-test/src/lib/images/tower.jpg filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
venv/lib/python3.9/site-packages/gradio/_frontend_code/component-test/src/lib/images/world.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
venv/lib/python3.9/site-packages/gradio/_frontend_code/core/public/static/img/Duck.glb filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
venv/lib/python3.9/site-packages/gradio/_frontend_code/spa/public/static/img/Duck.glb filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
venv/lib/python3.9/site-packages/gradio/_frontend_code/spa/test/files/contract.pdf filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
venv/lib/python3.9/site-packages/gradio/_frontend_code/spa/test/files/file_test.ogg filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
venv/lib/python3.9/site-packages/gradio/_frontend_code/spa/test/files/world.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
venv/lib/python3.9/site-packages/gradio/frpc_linux_amd64_v0.2 filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
venv/lib/python3.9/site-packages/gradio/templates/frontend/static/img/Duck.glb filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
venv/lib/python3.9/site-packages/gradio_client/__pycache__/media_data.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
venv/lib/python3.9/site-packages/hf_xet/hf_xet.abi3.so filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
venv/lib/python3.9/site-packages/huggingface_hub/__pycache__/hf_api.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
venv/lib/python3.9/site-packages/huggingface_hub/inference/__pycache__/_client.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
venv/lib/python3.9/site-packages/huggingface_hub/inference/_generated/__pycache__/_async_client.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
venv/lib/python3.9/site-packages/idna/__pycache__/uts46data.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
venv/lib/python3.9/site-packages/kiwisolver/_cext.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
venv/lib/python3.9/site-packages/lxml/_elementpath.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
venv/lib/python3.9/site-packages/lxml/builder.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
venv/lib/python3.9/site-packages/lxml/etree.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
venv/lib/python3.9/site-packages/lxml/html/_difflib.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 82 |
+
venv/lib/python3.9/site-packages/lxml/html/diff.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 83 |
+
venv/lib/python3.9/site-packages/lxml/objectify.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 84 |
+
venv/lib/python3.9/site-packages/lxml/sax.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 85 |
+
venv/lib/python3.9/site-packages/matplotlib/__pycache__/backend_bases.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 86 |
+
venv/lib/python3.9/site-packages/matplotlib/__pycache__/figure.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 87 |
+
venv/lib/python3.9/site-packages/matplotlib/__pycache__/patches.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 88 |
+
venv/lib/python3.9/site-packages/matplotlib/__pycache__/pyplot.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 89 |
+
venv/lib/python3.9/site-packages/matplotlib/__pycache__/widgets.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 90 |
+
venv/lib/python3.9/site-packages/matplotlib/_c_internal_utils.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 91 |
+
venv/lib/python3.9/site-packages/matplotlib/_image.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 92 |
+
venv/lib/python3.9/site-packages/matplotlib/_path.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 93 |
+
venv/lib/python3.9/site-packages/matplotlib/_qhull.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 94 |
+
venv/lib/python3.9/site-packages/matplotlib/_tri.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 95 |
+
venv/lib/python3.9/site-packages/matplotlib/_ttconv.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 96 |
+
venv/lib/python3.9/site-packages/matplotlib/axes/__pycache__/_axes.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 97 |
+
venv/lib/python3.9/site-packages/matplotlib/axes/__pycache__/_base.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 98 |
+
venv/lib/python3.9/site-packages/matplotlib/backends/_backend_agg.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 99 |
+
venv/lib/python3.9/site-packages/matplotlib/backends/_tkagg.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 100 |
+
venv/lib/python3.9/site-packages/matplotlib/ft2font.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 101 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans-Bold.ttf filter=lfs diff=lfs merge=lfs -text
|
| 102 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans-BoldOblique.ttf filter=lfs diff=lfs merge=lfs -text
|
| 103 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans-Oblique.ttf filter=lfs diff=lfs merge=lfs -text
|
| 104 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf filter=lfs diff=lfs merge=lfs -text
|
| 105 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSansMono-Bold.ttf filter=lfs diff=lfs merge=lfs -text
|
| 106 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSansMono-BoldOblique.ttf filter=lfs diff=lfs merge=lfs -text
|
| 107 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSansMono-Oblique.ttf filter=lfs diff=lfs merge=lfs -text
|
| 108 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSansMono.ttf filter=lfs diff=lfs merge=lfs -text
|
| 109 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSerif-Bold.ttf filter=lfs diff=lfs merge=lfs -text
|
| 110 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSerif-BoldItalic.ttf filter=lfs diff=lfs merge=lfs -text
|
| 111 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSerif-Italic.ttf filter=lfs diff=lfs merge=lfs -text
|
| 112 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSerif.ttf filter=lfs diff=lfs merge=lfs -text
|
| 113 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/STIXGeneral.ttf filter=lfs diff=lfs merge=lfs -text
|
| 114 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/STIXGeneralBol.ttf filter=lfs diff=lfs merge=lfs -text
|
| 115 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/STIXGeneralBolIta.ttf filter=lfs diff=lfs merge=lfs -text
|
| 116 |
+
venv/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/STIXGeneralItalic.ttf filter=lfs diff=lfs merge=lfs -text
|
| 117 |
+
venv/lib/python3.9/site-packages/matplotlib/tests/__pycache__/test_axes.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 118 |
+
venv/lib/python3.9/site-packages/mpl_toolkits/mplot3d/__pycache__/axes3d.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 119 |
+
venv/lib/python3.9/site-packages/numpy/_core/__pycache__/_add_newdocs.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 120 |
+
venv/lib/python3.9/site-packages/numpy/_core/__pycache__/fromnumeric.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 121 |
+
venv/lib/python3.9/site-packages/numpy/_core/_multiarray_tests.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 122 |
+
venv/lib/python3.9/site-packages/numpy/_core/_multiarray_umath.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 123 |
+
venv/lib/python3.9/site-packages/numpy/_core/_simd.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 124 |
+
venv/lib/python3.9/site-packages/numpy/_core/tests/__pycache__/test_multiarray.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 125 |
+
venv/lib/python3.9/site-packages/numpy/_core/tests/__pycache__/test_numeric.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 126 |
+
venv/lib/python3.9/site-packages/numpy/_core/tests/__pycache__/test_regression.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 127 |
+
venv/lib/python3.9/site-packages/numpy/_core/tests/__pycache__/test_ufunc.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 128 |
+
venv/lib/python3.9/site-packages/numpy/_core/tests/__pycache__/test_umath.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 129 |
+
venv/lib/python3.9/site-packages/numpy/fft/_pocketfft_umath.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 130 |
+
venv/lib/python3.9/site-packages/numpy/lib/__pycache__/_function_base_impl.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 131 |
+
venv/lib/python3.9/site-packages/numpy/lib/tests/__pycache__/test_function_base.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 132 |
+
venv/lib/python3.9/site-packages/numpy/linalg/__pycache__/_linalg.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 133 |
+
venv/lib/python3.9/site-packages/numpy/linalg/_umath_linalg.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 134 |
+
venv/lib/python3.9/site-packages/numpy/ma/__pycache__/core.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 135 |
+
venv/lib/python3.9/site-packages/numpy/ma/tests/__pycache__/test_core.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 136 |
+
venv/lib/python3.9/site-packages/numpy/random/_bounded_integers.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 137 |
+
venv/lib/python3.9/site-packages/numpy/random/_common.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 138 |
+
venv/lib/python3.9/site-packages/numpy/random/_generator.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 139 |
+
venv/lib/python3.9/site-packages/numpy/random/_mt19937.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 140 |
+
venv/lib/python3.9/site-packages/numpy/random/_pcg64.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 141 |
+
venv/lib/python3.9/site-packages/numpy/random/_philox.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 142 |
+
venv/lib/python3.9/site-packages/numpy/random/bit_generator.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 143 |
+
venv/lib/python3.9/site-packages/numpy/random/mtrand.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 144 |
+
venv/lib/python3.9/site-packages/numpy.libs/libgfortran-040039e1-0352e75f.so.5.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 145 |
+
venv/lib/python3.9/site-packages/numpy.libs/libquadmath-96973f99-934c22de.so.0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 146 |
+
venv/lib/python3.9/site-packages/numpy.libs/libscipy_openblas64_-99b71e71.so filter=lfs diff=lfs merge=lfs -text
|
| 147 |
+
venv/lib/python3.9/site-packages/orjson/orjson.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 148 |
+
venv/lib/python3.9/site-packages/pandas/_libs/algos.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 149 |
+
venv/lib/python3.9/site-packages/pandas/_libs/arrays.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 150 |
+
venv/lib/python3.9/site-packages/pandas/_libs/groupby.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 151 |
+
venv/lib/python3.9/site-packages/pandas/_libs/hashing.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 152 |
+
venv/lib/python3.9/site-packages/pandas/_libs/hashtable.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 153 |
+
venv/lib/python3.9/site-packages/pandas/_libs/index.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 154 |
+
venv/lib/python3.9/site-packages/pandas/_libs/internals.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 155 |
+
venv/lib/python3.9/site-packages/pandas/_libs/interval.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 156 |
+
venv/lib/python3.9/site-packages/pandas/_libs/join.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 157 |
+
venv/lib/python3.9/site-packages/pandas/_libs/lib.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 158 |
+
venv/lib/python3.9/site-packages/pandas/_libs/missing.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 159 |
+
venv/lib/python3.9/site-packages/pandas/_libs/ops.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 160 |
+
venv/lib/python3.9/site-packages/pandas/_libs/parsers.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 161 |
+
venv/lib/python3.9/site-packages/pandas/_libs/reshape.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 162 |
+
venv/lib/python3.9/site-packages/pandas/_libs/sas.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 163 |
+
venv/lib/python3.9/site-packages/pandas/_libs/sparse.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 164 |
+
venv/lib/python3.9/site-packages/pandas/_libs/testing.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 165 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslib.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 166 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/conversion.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 167 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/dtypes.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 168 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/fields.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 169 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/nattype.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 170 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/np_datetime.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 171 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/offsets.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 172 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/parsing.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 173 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/period.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 174 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/strptime.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 175 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/timedeltas.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 176 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/timestamps.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 177 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/timezones.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 178 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/tzconversion.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 179 |
+
venv/lib/python3.9/site-packages/pandas/_libs/tslibs/vectorized.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 180 |
+
venv/lib/python3.9/site-packages/pandas/_libs/window/aggregations.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 181 |
+
venv/lib/python3.9/site-packages/pandas/_libs/window/indexers.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 182 |
+
venv/lib/python3.9/site-packages/pandas/_libs/writers.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 183 |
+
venv/lib/python3.9/site-packages/pandas/core/__pycache__/frame.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 184 |
+
venv/lib/python3.9/site-packages/pandas/core/__pycache__/generic.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 185 |
+
venv/lib/python3.9/site-packages/pandas/core/__pycache__/series.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 186 |
+
venv/lib/python3.9/site-packages/pandas/core/groupby/__pycache__/groupby.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 187 |
+
venv/lib/python3.9/site-packages/pandas/core/indexes/__pycache__/base.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 188 |
+
venv/lib/python3.9/site-packages/pandas/core/indexes/__pycache__/multi.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 189 |
+
venv/lib/python3.9/site-packages/pandas/core/strings/__pycache__/accessor.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 190 |
+
venv/lib/python3.9/site-packages/pandas/io/__pycache__/pytables.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 191 |
+
venv/lib/python3.9/site-packages/pandas/io/__pycache__/stata.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 192 |
+
venv/lib/python3.9/site-packages/pandas/io/formats/__pycache__/style.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 193 |
+
venv/lib/python3.9/site-packages/pandas/tests/frame/__pycache__/test_constructors.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 194 |
+
venv/lib/python3.9/site-packages/pandas/tests/indexing/__pycache__/test_loc.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 195 |
+
venv/lib/python3.9/site-packages/pandas/tests/io/__pycache__/test_sql.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 196 |
+
venv/lib/python3.9/site-packages/pandas/tests/tools/__pycache__/test_to_datetime.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 197 |
+
venv/lib/python3.9/site-packages/pdfminer/__pycache__/glyphlist.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 198 |
+
venv/lib/python3.9/site-packages/pillow.libs/libbrotlicommon-3ecfe81c.so.1 filter=lfs diff=lfs merge=lfs -text
|
| 199 |
+
venv/lib/python3.9/site-packages/pillow.libs/libfreetype-be14bf51.so.6.20.1 filter=lfs diff=lfs merge=lfs -text
|
| 200 |
+
venv/lib/python3.9/site-packages/pillow.libs/libharfbuzz-89381d8f.so.0.60850.0 filter=lfs diff=lfs merge=lfs -text
|
| 201 |
+
venv/lib/python3.9/site-packages/pillow.libs/libjpeg-77ae51ab.so.62.4.0 filter=lfs diff=lfs merge=lfs -text
|
| 202 |
+
venv/lib/python3.9/site-packages/pillow.libs/liblcms2-e69eef39.so.2.0.16 filter=lfs diff=lfs merge=lfs -text
|
| 203 |
+
venv/lib/python3.9/site-packages/pillow.libs/liblzma-13fa198c.so.5.4.5 filter=lfs diff=lfs merge=lfs -text
|
| 204 |
+
venv/lib/python3.9/site-packages/pillow.libs/libopenjp2-05423b53.so filter=lfs diff=lfs merge=lfs -text
|
| 205 |
+
venv/lib/python3.9/site-packages/pillow.libs/libpng16-58efbb84.so.16.43.0 filter=lfs diff=lfs merge=lfs -text
|
| 206 |
+
venv/lib/python3.9/site-packages/pillow.libs/libtiff-0a86184d.so.6.0.2 filter=lfs diff=lfs merge=lfs -text
|
| 207 |
+
venv/lib/python3.9/site-packages/pillow.libs/libwebp-2fd3cdca.so.7.1.9 filter=lfs diff=lfs merge=lfs -text
|
| 208 |
+
venv/lib/python3.9/site-packages/pillow.libs/libxcb-b8a56d01.so.1.1.0 filter=lfs diff=lfs merge=lfs -text
|
| 209 |
+
venv/lib/python3.9/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
|
| 210 |
+
venv/lib/python3.9/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
|
| 211 |
+
venv/lib/python3.9/site-packages/pip/_vendor/distlib/w64-arm.exe filter=lfs diff=lfs merge=lfs -text
|
| 212 |
+
venv/lib/python3.9/site-packages/pip/_vendor/idna/__pycache__/uts46data.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 213 |
+
venv/lib/python3.9/site-packages/pip/_vendor/pkg_resources/__pycache__/__init__.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 214 |
+
venv/lib/python3.9/site-packages/pip/_vendor/pyparsing/__pycache__/core.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 215 |
+
venv/lib/python3.9/site-packages/pip/_vendor/rich/__pycache__/_emoji_codes.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 216 |
+
venv/lib/python3.9/site-packages/pkg_resources/__pycache__/__init__.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 217 |
+
venv/lib/python3.9/site-packages/pkg_resources/_vendor/__pycache__/pyparsing.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 218 |
+
venv/lib/python3.9/site-packages/pycparser/__pycache__/yacctab.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 219 |
+
venv/lib/python3.9/site-packages/pydantic_core/__pycache__/core_schema.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 220 |
+
venv/lib/python3.9/site-packages/pydantic_core/_pydantic_core.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 221 |
+
venv/lib/python3.9/site-packages/pygments/lexers/__pycache__/lisp.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 222 |
+
venv/lib/python3.9/site-packages/pyparsing/__pycache__/core.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 223 |
+
venv/lib/python3.9/site-packages/pypdfium2_raw/libpdfium.so filter=lfs diff=lfs merge=lfs -text
|
| 224 |
+
venv/lib/python3.9/site-packages/rich/__pycache__/_emoji_codes.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 225 |
+
venv/lib/python3.9/site-packages/setuptools/_vendor/__pycache__/pyparsing.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 226 |
+
venv/lib/python3.9/site-packages/setuptools/_vendor/more_itertools/__pycache__/more.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 227 |
+
venv/lib/python3.9/site-packages/yaml/_yaml.cpython-39-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,12 +1,6 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
|
| 4 |
-
colorFrom: purple
|
| 5 |
-
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Web_Scraping_to_Markdown_for_RAG
|
| 3 |
+
app_file: file_cleaning_ui.py
|
|
|
|
|
|
|
| 4 |
sdk: gradio
|
| 5 |
+
sdk_version: 4.44.1
|
|
|
|
|
|
|
| 6 |
---
|
|
|
|
|
|
file_cleaning.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
import argparse
|
| 3 |
+
import re
|
| 4 |
+
import pdfplumber
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
# Typographic characters rewritten to plain equivalents before output.
_RAG_REPLACEMENTS = {
    "’": "'", "‘": "'", "“": '"', "”": '"', "«": '"', "»": '"',
    "–": "-", "—": "-", "…": "...",
    "œ": "oe", "Œ": "OE", "æ": "ae", "Æ": "AE",
    "©": "(c)", "®": "(R)", "™": "TM",
    "±": "+/-", "×": "x", "÷": "/",
}
_RAG_TRANSLATION = str.maketrans(_RAG_REPLACEMENTS)

# Everything that is NOT a letter (incl. accented), digit, whitespace, simple
# punctuation, or a character that has an entry in _RAG_REPLACEMENTS.
# The backslash stays in the allowed set, as in the historical pattern.
_RAG_DISALLOWED_RE = re.compile(
    r"[^a-zA-ZÀ-ÿæ-œ0-9\s.,:;!?\-_'\"\\()"
    + re.escape("".join(_RAG_REPLACEMENTS))
    + "]"
)
_RAG_WHITESPACE_RE = re.compile(r"\s+")


def clean_text_for_rag(text: str) -> str:
    """Clean a piece of text for RAG ingestion.

    The text goes through three steps:

    1. Every character that is neither allowed (letters including accented
       ones, digits, whitespace, simple punctuation) nor listed in
       ``_RAG_REPLACEMENTS`` is removed.
    2. Typographic characters (curly quotes, dashes, ellipsis, ligatures,
       ``©``, ``®``, ``™``, ``±``, ``×``, ``÷``) are replaced by their plain
       equivalents.
    3. Runs of whitespace (including non-breaking spaces) are collapsed to a
       single space and the result is stripped.

    Filtering happens *before* the replacement so that multi-character
    replacements containing otherwise-disallowed characters survive:
    ``±`` yields ``+/-`` and ``÷`` yields ``/``.  A ``+`` or ``/`` typed
    directly in the input is still removed.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, single-spaced text (possibly empty).
    """
    # NOTE(review): "§" and "°" are removed by the filter, which matches the
    # previous observable behaviour — confirm whether they should be kept.
    text = _RAG_DISALLOWED_RE.sub('', text)
    text = text.translate(_RAG_TRANSLATION)
    return _RAG_WHITESPACE_RE.sub(' ', text).strip()
|
| 28 |
+
|
| 29 |
+
def extract_and_clean_pdf(pdf_path, output_path):
    """Extract the text of a PDF, clean it for RAG, and save it.

    Each page's text is obtained with ``page.extract_text()``; pages that
    yield no text are skipped.  The page texts are joined with a single
    space, passed through ``clean_text_for_rag`` and written to
    ``output_path`` as UTF-8.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the file that receives the cleaned text.
    """
    print(f"[+] Extraction du texte depuis : {pdf_path}")

    with pdfplumber.open(pdf_path) as document:
        extracted = (page.extract_text() for page in document.pages)
        # Keep only pages for which some text was actually extracted.
        page_texts = [content for content in extracted if content]

    cleaned = clean_text_for_rag(' '.join(page_texts))

    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.write(cleaned)

    print(f"[+] Texte nettoyé sauvegardé dans : {output_path}")
|
| 52 |
+
|
| 53 |
+
def extract_text(file_path, output_path):
    """Read a UTF-8 text file, clean it line by line, and save the result.

    Every line is passed through ``clean_text_for_rag``.  Lines that are
    empty *after* cleaning are dropped — this covers blank lines as well as
    lines made only of characters the cleaner removes (e.g. ``###``), which
    would otherwise leave stray empty lines in the output.  The remaining
    lines are joined with newlines and written to ``output_path`` as UTF-8.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        output_path: Path of the file that receives the cleaned text.

    Raises:
        Exception: Any error raised while reading, cleaning or writing is
            reported on standard output and then re-raised unchanged.
    """
    print(f"[+] Extraction du texte depuis : {file_path}")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # The cleaner already strips surrounding whitespace, so the
            # emptiness test is done on its result rather than on the raw line.
            cleaned_lines = [
                cleaned
                for cleaned in (clean_text_for_rag(line) for line in f)
                if cleaned
            ]

        # Newline-separated so the output reads as a proper .md file.
        cleaned_text = '\n'.join(cleaned_lines)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        print(f"[+] Texte nettoyé sauvegardé dans : {output_path}")
    except Exception as e:
        print(f"Erreur lors de l'ouverture du fichier {file_path} : {e}")
        raise
|
| 76 |
+
|
| 77 |
+
def main():
    """Command-line entry point: clean one input file and write it as Markdown.

    Parses two positional arguments (input file, output ``.md`` path), forces
    the output path to end in ``.md`` and dispatches on the input extension:
    ``.pdf`` goes to ``extract_and_clean_pdf``, everything else to
    ``extract_text``.

    Raises:
        SystemExit: With status 1 when the input path does not exist, so that
            shells and calling scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(description='Extraire et nettoyer un fichier pour le RAG. Sortie toujours en .md.')
    parser.add_argument('input_file', type=str, help='Chemin vers le fichier à traiter (PDF, TXT, MD, etc.).')
    parser.add_argument('output_md', type=str, help='Chemin du fichier Markdown (.md) de sortie.')

    args = parser.parse_args()

    input_path = args.input_file
    output_path = args.output_md  # kept as given unless the extension is wrong

    # A missing input is an error: report it and exit with a non-zero status
    # instead of returning as if the run had succeeded.
    if not os.path.exists(input_path):
        print(f"Erreur : Le fichier {input_path} n'existe pas.")
        raise SystemExit(1)

    # The extension is compared case-insensitively.
    _, ext = os.path.splitext(input_path.lower())

    # The output is always Markdown: replace any other extension with .md.
    if not output_path.lower().endswith('.md'):
        print("[!] Avertissement : Le fichier de sortie n'a pas l'extension .md. Il sera renommé en .md.")
        output_path = os.path.splitext(output_path)[0] + '.md'

    # Pick the extractor that matches the input format.
    if ext == '.pdf':
        extract_and_clean_pdf(input_path, output_path)
    else:
        extract_text(input_path, output_path)
|
| 105 |
+
|
| 106 |
+
if __name__ == '__main__':
|
| 107 |
+
main()
|
file_cleaning_ui.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
import tempfile
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import pdfplumber
|
| 9 |
+
import gradio as gr
|
| 10 |
+
|
| 11 |
+
# Typographic characters and the plain-ASCII text that replaces each of them.
_TYPOGRAPHIC_TABLE = str.maketrans({
    "’": "'", "‘": "'", "“": '"', "”": '"',
    "«": '"', "»": '"', "–": "-", "—": "-",
    "…": "...", "œ": "oe", "Œ": "OE",
    "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
    "™": "TM", "±": "+/-", "×": "x", "÷": "/",
})

# Everything outside this whitelist is deleted. Besides letters (accented
# included), digits, whitespace and simple punctuation, it keeps the
# typographic characters that are translated afterwards ("×", "÷", "œ", "æ"
# and their capitals already fall inside the À-ÿ / æ-œ ranges).
_DISALLOWED_CHARS = re.compile(
    r"[^a-zA-ZÀ-ÿæ-œ0-9\s.,:;!?\-_'\"\\()’‘“”«»–—…©®™±]"
)
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text_for_rag(text: str) -> str:
    """Normalise and clean *text* for RAG ingestion.

    Steps, in order:

    1. Delete every character that is neither whitelisted (letters, digits,
       whitespace, simple punctuation) nor a typographic character with a
       known ASCII replacement.
    2. Replace typographic characters (curly quotes, guillemets, dashes,
       ellipsis, ligatures, "©", "®", "™", "±", "×", "÷") with their ASCII
       equivalents.
    3. Collapse every run of whitespace, non-breaking spaces included, into
       one space and strip both ends.

    Filtering happens before the replacement on purpose: the replacements for
    "±" ("+/-") and "÷" ("/") contain characters that the filter removes, so
    replacing first turned "±5" into "-5" and dropped "÷" entirely. A literal
    "+" or "/" in the input is still removed, as are "§" and "°".

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-spaced text.
    """
    text = _DISALLOWED_CHARS.sub('', text)
    text = text.translate(_TYPOGRAPHIC_TABLE)
    return _WHITESPACE_RUN.sub(' ', text).strip()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def extract_and_clean_pdf(pdf_path: str) -> str:
    """Open a PDF, collect the text of its pages and return it cleaned.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined with single spaces and passed through
        ``clean_text_for_rag``. Pages yielding no text are ignored.
    """
    print(f"[+] Extraction du PDF : {pdf_path}")
    with pdfplumber.open(pdf_path) as document:
        # filter(None, ...) drops pages whose extraction is None or "".
        joined = " ".join(
            filter(None, (page.extract_text() for page in document.pages))
        )
    return clean_text_for_rag(joined)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def extract_and_clean_txt(txt_path: str) -> str:
    """Read a UTF-8 text file (txt, md, ...) and return it cleaned.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The non-blank lines, each passed through ``clean_text_for_rag``,
        joined with newlines.
    """
    print(f"[+] Lecture du fichier texte : {txt_path}")
    with open(txt_path, "r", encoding="utf-8") as source:
        raw_lines = source.readlines()

    cleaned_lines = []
    for raw in raw_lines:
        stripped = raw.strip()
        if not stripped:
            # Blank lines are not carried over to the output.
            continue
        cleaned_lines.append(clean_text_for_rag(stripped))
    return "\n".join(cleaned_lines)
|
| 55 |
+
|
| 56 |
+
def process_file(input_file: gr.File, output_name: str) -> str:
    """Clean an uploaded file and return the path of the resulting ``.md`` file.

    - Detects the input type from its extension (PDF or anything else,
      treated as text).
    - Extracts and cleans the text.
    - Writes it to a file inside a fresh temporary directory, named after
      ``output_name``.
    - Returns that path so Gradio can offer the file for download.

    Args:
        input_file: Uploaded file object; its ``name`` attribute is used as
            the path to read.
        output_name: File name requested by the user for the download. Only
            its final path component is used, and ``.md`` is appended when
            missing.

    Returns:
        Path of the written Markdown file.
    """
    input_path = input_file.name
    _, ext = os.path.splitext(input_path.lower())

    if ext == ".pdf":
        cleaned_text = extract_and_clean_pdf(input_path)
    else:
        cleaned_text = extract_and_clean_txt(input_path)

    # output_name is free text typed by the user. Keep only its last path
    # component so that an absolute path or "../" sequences cannot make the
    # write land outside the temporary directory. Backslashes are normalised
    # first so Windows-style separators are handled as well.
    safe_name = os.path.basename(output_name.strip().replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        # Nothing usable was given: fall back to a generic name.
        safe_name = "output"
    if not safe_name.lower().endswith(".md"):
        safe_name = f"{safe_name}.md"

    temp_dir = tempfile.mkdtemp()
    out_path = os.path.join(temp_dir, safe_name)

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(cleaned_text)

    return out_path
|
| 82 |
+
|
| 83 |
+
with gr.Blocks(title="Nettoyage de texte pour RAG") as demo:
|
| 84 |
+
gr.Markdown("# 📄 Nettoyage de texte pour RAG (PDF, TXT, MD)")
|
| 85 |
+
gr.Markdown(
|
| 86 |
+
"Déposez un fichier PDF, TXT ou MD. Le texte sera extrait, nettoyé "
|
| 87 |
+
"et vous pourrez le télécharger sous **le nom que vous choisissez**."
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
with gr.Row():
|
| 91 |
+
with gr.Column(scale=1):
|
| 92 |
+
input_file = gr.File(
|
| 93 |
+
label="Déposez votre fichier ici",
|
| 94 |
+
file_types=["pdf", "txt", "md", "file"],
|
| 95 |
+
)
|
| 96 |
+
output_name = gr.Textbox(
|
| 97 |
+
value="output.md",
|
| 98 |
+
label="Nom du fichier de sortie (en .md)",
|
| 99 |
+
placeholder="exemple.md",
|
| 100 |
+
interactive=True,
|
| 101 |
+
)
|
| 102 |
+
submit_btn = gr.Button("Traiter le fichier", variant="primary")
|
| 103 |
+
with gr.Column(scale=1):
|
| 104 |
+
output_file = gr.File(
|
| 105 |
+
label="Fichier nettoyé (.md)",
|
| 106 |
+
file_types=["md"],
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
submit_btn.click(
|
| 110 |
+
fn=process_file,
|
| 111 |
+
inputs=[input_file, output_name],
|
| 112 |
+
outputs=output_file,
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
gr.Markdown(
|
| 116 |
+
"""
|
| 117 |
+
---
|
| 118 |
+
**Nettoyage effectué :**
|
| 119 |
+
- Suppression des symboles non imprimables / caractères parasites
|
| 120 |
+
- Conservation : lettres (avec accents), chiffres, espaces, ponctuation simple
|
| 121 |
+
- Normalisation des espaces
|
| 122 |
+
- Sortie toujours au format **`.md`**
|
| 123 |
+
"""
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
if __name__ == "__main__":
|
| 127 |
+
demo.launch(share=True)
|
gr
ADDED
|
File without changes
|
os
ADDED
|
File without changes
|
pdfplumber
ADDED
|
File without changes
|
re
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pdfplumber
|
venv/bin/Activate.ps1
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<#
|
| 2 |
+
.Synopsis
|
| 3 |
+
Activate a Python virtual environment for the current PowerShell session.
|
| 4 |
+
|
| 5 |
+
.Description
|
| 6 |
+
Pushes the python executable for a virtual environment to the front of the
|
| 7 |
+
$Env:PATH environment variable and sets the prompt to signify that you are
|
| 8 |
+
in a Python virtual environment. Makes use of the command line switches as
|
| 9 |
+
well as the `pyvenv.cfg` file values present in the virtual environment.
|
| 10 |
+
|
| 11 |
+
.Parameter VenvDir
|
| 12 |
+
Path to the directory that contains the virtual environment to activate. The
|
| 13 |
+
default value for this is the parent of the directory that the Activate.ps1
|
| 14 |
+
script is located within.
|
| 15 |
+
|
| 16 |
+
.Parameter Prompt
|
| 17 |
+
The prompt prefix to display when this virtual environment is activated. By
|
| 18 |
+
default, this prompt is the name of the virtual environment folder (VenvDir)
|
| 19 |
+
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
|
| 20 |
+
|
| 21 |
+
.Example
|
| 22 |
+
Activate.ps1
|
| 23 |
+
Activates the Python virtual environment that contains the Activate.ps1 script.
|
| 24 |
+
|
| 25 |
+
.Example
|
| 26 |
+
Activate.ps1 -Verbose
|
| 27 |
+
Activates the Python virtual environment that contains the Activate.ps1 script,
|
| 28 |
+
and shows extra information about the activation as it executes.
|
| 29 |
+
|
| 30 |
+
.Example
|
| 31 |
+
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
|
| 32 |
+
Activates the Python virtual environment located in the specified location.
|
| 33 |
+
|
| 34 |
+
.Example
|
| 35 |
+
Activate.ps1 -Prompt "MyPython"
|
| 36 |
+
Activates the Python virtual environment that contains the Activate.ps1 script,
|
| 37 |
+
and prefixes the current prompt with the specified string (surrounded in
|
| 38 |
+
parentheses) while the virtual environment is active.
|
| 39 |
+
|
| 40 |
+
.Notes
|
| 41 |
+
On Windows, it may be required to enable this Activate.ps1 script by setting the
|
| 42 |
+
execution policy for the user. You can do this by issuing the following PowerShell
|
| 43 |
+
command:
|
| 44 |
+
|
| 45 |
+
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
|
| 46 |
+
|
| 47 |
+
For more information on Execution Policies:
|
| 48 |
+
https://go.microsoft.com/fwlink/?LinkID=135170
|
| 49 |
+
|
| 50 |
+
#>
|
| 51 |
+
Param(
|
| 52 |
+
[Parameter(Mandatory = $false)]
|
| 53 |
+
[String]
|
| 54 |
+
$VenvDir,
|
| 55 |
+
[Parameter(Mandatory = $false)]
|
| 56 |
+
[String]
|
| 57 |
+
$Prompt
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
<# Function declarations --------------------------------------------------- #>
|
| 61 |
+
|
| 62 |
+
<#
|
| 63 |
+
.Synopsis
|
| 64 |
+
Remove all shell session elements added by the Activate script, including the
|
| 65 |
+
addition of the virtual environment's Python executable from the beginning of
|
| 66 |
+
the PATH variable.
|
| 67 |
+
|
| 68 |
+
.Parameter NonDestructive
|
| 69 |
+
If present, do not remove this function from the global namespace for the
|
| 70 |
+
session.
|
| 71 |
+
|
| 72 |
+
#>
|
| 73 |
+
function global:deactivate ([switch]$NonDestructive) {
|
| 74 |
+
# Revert to original values
|
| 75 |
+
|
| 76 |
+
# The prior prompt:
|
| 77 |
+
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
|
| 78 |
+
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
|
| 79 |
+
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
# The prior PYTHONHOME:
|
| 83 |
+
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
|
| 84 |
+
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
|
| 85 |
+
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
# The prior PATH:
|
| 89 |
+
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
|
| 90 |
+
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
|
| 91 |
+
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
# Just remove the VIRTUAL_ENV altogether:
|
| 95 |
+
if (Test-Path -Path Env:VIRTUAL_ENV) {
|
| 96 |
+
Remove-Item -Path env:VIRTUAL_ENV
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
|
| 100 |
+
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
|
| 101 |
+
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
# Leave deactivate function in the global namespace if requested:
|
| 105 |
+
if (-not $NonDestructive) {
|
| 106 |
+
Remove-Item -Path function:deactivate
|
| 107 |
+
}
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
<#
|
| 111 |
+
.Description
|
| 112 |
+
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
|
| 113 |
+
given folder, and returns them in a map.
|
| 114 |
+
|
| 115 |
+
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
|
| 116 |
+
two strings separated by `=` (with any amount of whitespace surrounding the =)
|
| 117 |
+
then it is considered a `key = value` line. The left hand string is the key,
|
| 118 |
+
the right hand is the value.
|
| 119 |
+
|
| 120 |
+
If the value starts with a `'` or a `"` then the first and last character is
|
| 121 |
+
stripped from the value before being captured.
|
| 122 |
+
|
| 123 |
+
.Parameter ConfigDir
|
| 124 |
+
Path to the directory that contains the `pyvenv.cfg` file.
|
| 125 |
+
#>
|
| 126 |
+
function Get-PyVenvConfig(
|
| 127 |
+
[String]
|
| 128 |
+
$ConfigDir
|
| 129 |
+
) {
|
| 130 |
+
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
|
| 131 |
+
|
| 132 |
+
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
|
| 133 |
+
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
|
| 134 |
+
|
| 135 |
+
# An empty map will be returned if no config file is found.
|
| 136 |
+
$pyvenvConfig = @{ }
|
| 137 |
+
|
| 138 |
+
if ($pyvenvConfigPath) {
|
| 139 |
+
|
| 140 |
+
Write-Verbose "File exists, parse `key = value` lines"
|
| 141 |
+
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
|
| 142 |
+
|
| 143 |
+
$pyvenvConfigContent | ForEach-Object {
|
| 144 |
+
$keyval = $PSItem -split "\s*=\s*", 2
|
| 145 |
+
if ($keyval[0] -and $keyval[1]) {
|
| 146 |
+
$val = $keyval[1]
|
| 147 |
+
|
| 148 |
+
# Remove extraneous quotations around a string value.
|
| 149 |
+
if ("'""".Contains($val.Substring(0, 1))) {
|
| 150 |
+
$val = $val.Substring(1, $val.Length - 2)
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
$pyvenvConfig[$keyval[0]] = $val
|
| 154 |
+
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
|
| 155 |
+
}
|
| 156 |
+
}
|
| 157 |
+
}
|
| 158 |
+
return $pyvenvConfig
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
<# Begin Activate script --------------------------------------------------- #>
|
| 163 |
+
|
| 164 |
+
# Determine the containing directory of this script
|
| 165 |
+
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
| 166 |
+
$VenvExecDir = Get-Item -Path $VenvExecPath
|
| 167 |
+
|
| 168 |
+
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
|
| 169 |
+
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
|
| 170 |
+
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
|
| 171 |
+
|
| 172 |
+
# Set values required in priority: CmdLine, ConfigFile, Default
|
| 173 |
+
# First, get the location of the virtual environment, it might not be
|
| 174 |
+
# VenvExecDir if specified on the command line.
|
| 175 |
+
if ($VenvDir) {
|
| 176 |
+
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
|
| 177 |
+
}
|
| 178 |
+
else {
|
| 179 |
+
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
|
| 180 |
+
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
|
| 181 |
+
Write-Verbose "VenvDir=$VenvDir"
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
# Next, read the `pyvenv.cfg` file to determine any required value such
|
| 185 |
+
# as `prompt`.
|
| 186 |
+
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
|
| 187 |
+
|
| 188 |
+
# Next, set the prompt from the command line, or the config file, or
|
| 189 |
+
# just use the name of the virtual environment folder.
|
| 190 |
+
if ($Prompt) {
|
| 191 |
+
Write-Verbose "Prompt specified as argument, using '$Prompt'"
|
| 192 |
+
}
|
| 193 |
+
else {
|
| 194 |
+
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
|
| 195 |
+
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
|
| 196 |
+
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
|
| 197 |
+
$Prompt = $pyvenvCfg['prompt'];
|
| 198 |
+
}
|
| 199 |
+
else {
|
| 200 |
+
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
|
| 201 |
+
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
|
| 202 |
+
$Prompt = Split-Path -Path $venvDir -Leaf
|
| 203 |
+
}
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
Write-Verbose "Prompt = '$Prompt'"
|
| 207 |
+
Write-Verbose "VenvDir='$VenvDir'"
|
| 208 |
+
|
| 209 |
+
# Deactivate any currently active virtual environment, but leave the
|
| 210 |
+
# deactivate function in place.
|
| 211 |
+
deactivate -nondestructive
|
| 212 |
+
|
| 213 |
+
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
|
| 214 |
+
# that there is an activated venv.
|
| 215 |
+
$env:VIRTUAL_ENV = $VenvDir
|
| 216 |
+
|
| 217 |
+
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
|
| 218 |
+
|
| 219 |
+
Write-Verbose "Setting prompt to '$Prompt'"
|
| 220 |
+
|
| 221 |
+
# Set the prompt to include the env name
|
| 222 |
+
# Make sure _OLD_VIRTUAL_PROMPT is global
|
| 223 |
+
function global:_OLD_VIRTUAL_PROMPT { "" }
|
| 224 |
+
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
|
| 225 |
+
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
|
| 226 |
+
|
| 227 |
+
function global:prompt {
|
| 228 |
+
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
|
| 229 |
+
_OLD_VIRTUAL_PROMPT
|
| 230 |
+
}
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
# Clear PYTHONHOME
|
| 234 |
+
if (Test-Path -Path Env:PYTHONHOME) {
|
| 235 |
+
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
|
| 236 |
+
Remove-Item -Path Env:PYTHONHOME
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
# Add the venv to the PATH
|
| 240 |
+
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
|
| 241 |
+
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
|
venv/bin/__pycache__/dumppdf.cpython-39.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
venv/bin/__pycache__/pdf2txt.cpython-39.pyc
ADDED
|
Binary file (7.58 kB). View file
|
|
|
venv/bin/activate
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file must be used with "source bin/activate" *from bash*
|
| 2 |
+
# you cannot run it directly
|
| 3 |
+
|
| 4 |
+
deactivate () {
|
| 5 |
+
# reset old environment variables
|
| 6 |
+
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
|
| 7 |
+
PATH="${_OLD_VIRTUAL_PATH:-}"
|
| 8 |
+
export PATH
|
| 9 |
+
unset _OLD_VIRTUAL_PATH
|
| 10 |
+
fi
|
| 11 |
+
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
|
| 12 |
+
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
|
| 13 |
+
export PYTHONHOME
|
| 14 |
+
unset _OLD_VIRTUAL_PYTHONHOME
|
| 15 |
+
fi
|
| 16 |
+
|
| 17 |
+
# This should detect bash and zsh, which have a hash command that must
|
| 18 |
+
# be called to get it to forget past commands. Without forgetting
|
| 19 |
+
# past commands the $PATH changes we made may not be respected
|
| 20 |
+
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
|
| 21 |
+
hash -r 2> /dev/null
|
| 22 |
+
fi
|
| 23 |
+
|
| 24 |
+
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
|
| 25 |
+
PS1="${_OLD_VIRTUAL_PS1:-}"
|
| 26 |
+
export PS1
|
| 27 |
+
unset _OLD_VIRTUAL_PS1
|
| 28 |
+
fi
|
| 29 |
+
|
| 30 |
+
unset VIRTUAL_ENV
|
| 31 |
+
if [ ! "${1:-}" = "nondestructive" ] ; then
|
| 32 |
+
# Self destruct!
|
| 33 |
+
unset -f deactivate
|
| 34 |
+
fi
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
# unset irrelevant variables
|
| 38 |
+
deactivate nondestructive
|
| 39 |
+
|
| 40 |
+
VIRTUAL_ENV="/home/l.sottani/rag/preprocessing/venv"
|
| 41 |
+
export VIRTUAL_ENV
|
| 42 |
+
|
| 43 |
+
_OLD_VIRTUAL_PATH="$PATH"
|
| 44 |
+
PATH="$VIRTUAL_ENV/bin:$PATH"
|
| 45 |
+
export PATH
|
| 46 |
+
|
| 47 |
+
# unset PYTHONHOME if set
|
| 48 |
+
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
|
| 49 |
+
# could use `if (set -u; : $PYTHONHOME) ;` in bash
|
| 50 |
+
if [ -n "${PYTHONHOME:-}" ] ; then
|
| 51 |
+
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
|
| 52 |
+
unset PYTHONHOME
|
| 53 |
+
fi
|
| 54 |
+
|
| 55 |
+
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
|
| 56 |
+
_OLD_VIRTUAL_PS1="${PS1:-}"
|
| 57 |
+
PS1="(venv) ${PS1:-}"
|
| 58 |
+
export PS1
|
| 59 |
+
fi
|
| 60 |
+
|
| 61 |
+
# This should detect bash and zsh, which have a hash command that must
|
| 62 |
+
# be called to get it to forget past commands. Without forgetting
|
| 63 |
+
# past commands the $PATH changes we made may not be respected
|
| 64 |
+
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
|
| 65 |
+
hash -r 2> /dev/null
|
| 66 |
+
fi
|
venv/bin/activate.csh
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file must be used with "source bin/activate.csh" *from csh*.
|
| 2 |
+
# You cannot run it directly.
|
| 3 |
+
# Created by Davide Di Blasi <davidedb@gmail.com>.
|
| 4 |
+
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
|
| 5 |
+
|
| 6 |
+
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; test "\!:*" != "nondestructive" && unalias deactivate'
|
| 7 |
+
|
| 8 |
+
# Unset irrelevant variables.
|
| 9 |
+
deactivate nondestructive
|
| 10 |
+
|
| 11 |
+
setenv VIRTUAL_ENV "/home/l.sottani/rag/preprocessing/venv"
|
| 12 |
+
|
| 13 |
+
set _OLD_VIRTUAL_PATH="$PATH"
|
| 14 |
+
setenv PATH "$VIRTUAL_ENV/bin:$PATH"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
set _OLD_VIRTUAL_PROMPT="$prompt"
|
| 18 |
+
|
| 19 |
+
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
|
| 20 |
+
set prompt = "(venv) $prompt"
|
| 21 |
+
endif
|
| 22 |
+
|
| 23 |
+
alias pydoc python -m pydoc
|
| 24 |
+
|
| 25 |
+
rehash
|
venv/bin/activate.fish
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
|
| 2 |
+
# (https://fishshell.com/); you cannot run it directly.
|
| 3 |
+
|
| 4 |
+
function deactivate -d "Exit virtual environment and return to normal shell environment"
|
| 5 |
+
# reset old environment variables
|
| 6 |
+
if test -n "$_OLD_VIRTUAL_PATH"
|
| 7 |
+
set -gx PATH $_OLD_VIRTUAL_PATH
|
| 8 |
+
set -e _OLD_VIRTUAL_PATH
|
| 9 |
+
end
|
| 10 |
+
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
|
| 11 |
+
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
|
| 12 |
+
set -e _OLD_VIRTUAL_PYTHONHOME
|
| 13 |
+
end
|
| 14 |
+
|
| 15 |
+
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
|
| 16 |
+
functions -e fish_prompt
|
| 17 |
+
set -e _OLD_FISH_PROMPT_OVERRIDE
|
| 18 |
+
functions -c _old_fish_prompt fish_prompt
|
| 19 |
+
functions -e _old_fish_prompt
|
| 20 |
+
end
|
| 21 |
+
|
| 22 |
+
set -e VIRTUAL_ENV
|
| 23 |
+
if test "$argv[1]" != "nondestructive"
|
| 24 |
+
# Self-destruct!
|
| 25 |
+
functions -e deactivate
|
| 26 |
+
end
|
| 27 |
+
end
|
| 28 |
+
|
| 29 |
+
# Unset irrelevant variables.
|
| 30 |
+
deactivate nondestructive
|
| 31 |
+
|
| 32 |
+
set -gx VIRTUAL_ENV "/home/l.sottani/rag/preprocessing/venv"
|
| 33 |
+
|
| 34 |
+
set -gx _OLD_VIRTUAL_PATH $PATH
|
| 35 |
+
set -gx PATH "$VIRTUAL_ENV/bin" $PATH
|
| 36 |
+
|
| 37 |
+
# Unset PYTHONHOME if set.
|
| 38 |
+
if set -q PYTHONHOME
|
| 39 |
+
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
|
| 40 |
+
set -e PYTHONHOME
|
| 41 |
+
end
|
| 42 |
+
|
| 43 |
+
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
|
| 44 |
+
# fish uses a function instead of an env var to generate the prompt.
|
| 45 |
+
|
| 46 |
+
# Save the current fish_prompt function as the function _old_fish_prompt.
|
| 47 |
+
functions -c fish_prompt _old_fish_prompt
|
| 48 |
+
|
| 49 |
+
# With the original prompt function renamed, we can override with our own.
|
| 50 |
+
function fish_prompt
|
| 51 |
+
# Save the return status of the last command.
|
| 52 |
+
set -l old_status $status
|
| 53 |
+
|
| 54 |
+
# Output the venv prompt; color taken from the blue of the Python logo.
|
| 55 |
+
printf "%s%s%s" (set_color 4B8BBE) "(venv) " (set_color normal)
|
| 56 |
+
|
| 57 |
+
# Restore the return status of the previous command.
|
| 58 |
+
echo "exit $old_status" | .
|
| 59 |
+
# Output the original/"old" prompt.
|
| 60 |
+
_old_fish_prompt
|
| 61 |
+
end
|
| 62 |
+
|
| 63 |
+
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
|
| 64 |
+
end
|
venv/bin/chardetect
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from chardet.cli.chardetect import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/dumppdf.py
ADDED
|
@@ -0,0 +1,480 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
"""Extract pdf structure in XML format"""
|
| 3 |
+
|
| 4 |
+
import logging
|
| 5 |
+
import os.path
|
| 6 |
+
import re
|
| 7 |
+
import sys
|
| 8 |
+
from argparse import ArgumentParser
|
| 9 |
+
from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, Union, cast
|
| 10 |
+
|
| 11 |
+
import pdfminer
|
| 12 |
+
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback
|
| 13 |
+
from pdfminer.pdfexceptions import (
|
| 14 |
+
PDFIOError,
|
| 15 |
+
PDFObjectNotFound,
|
| 16 |
+
PDFTypeError,
|
| 17 |
+
PDFValueError,
|
| 18 |
+
)
|
| 19 |
+
from pdfminer.pdfpage import PDFPage
|
| 20 |
+
from pdfminer.pdfparser import PDFParser
|
| 21 |
+
from pdfminer.pdftypes import PDFObjRef, PDFStream, resolve1, stream_value
|
| 22 |
+
from pdfminer.psparser import LIT, PSKeyword, PSLiteral
|
| 23 |
+
from pdfminer.utils import isnumber
|
| 24 |
+
|
| 25 |
+
logging.basicConfig()
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')


def escape(s: Union[str, bytes]) -> str:
    """Return *s* with every character matched by ``ESC_PAT`` escaped.

    Bytes input is first decoded as latin-1. Each matching character is
    replaced by a decimal character reference of the form ``&#N;`` where N is
    its code point; all other characters are returned unchanged.
    """
    text = s.decode("latin-1") if isinstance(s, bytes) else s

    def _as_char_ref(match):
        # Decimal character reference for the single matched character.
        return "&#%d;" % ord(match.group(0))

    return ESC_PAT.sub(_as_char_ref, text)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
    """Write *obj* to *out* as an XML fragment.

    Supported values are ``None``, dicts, lists, ``str``/``bytes``,
    ``PDFStream``, ``PDFObjRef``, ``PSKeyword``, ``PSLiteral`` and numbers.

    :param out: destination stream.
    :param obj: the object to serialise.
    :param codec: controls how a ``PDFStream`` is emitted. ``"raw"`` writes
        ``get_rawdata()`` and ``"binary"`` writes ``get_data()`` with no XML
        wrapper; ``"text"`` adds an escaped ``<data>`` element after the
        stream properties; any other value emits the properties only.
        The codec is not forwarded to values nested in dicts or lists.
    :raises PDFTypeError: if *obj* is none of the supported types.
    """

    def write_bytes(data: bytes) -> None:
        # Stream payloads are bytes, but ``out`` is normally a text stream and
        # ``out.write(bytes)`` raises TypeError there. Prefer the underlying
        # binary layer when the stream exposes one (sys.stdout, files opened
        # in text mode).
        binary = getattr(out, "buffer", None)
        if binary is not None:
            out.flush()  # keep already-written text ahead of the payload
            binary.write(data)
            return
        try:
            out.write(data)  # type: ignore [arg-type]  # binary-mode stream
        except TypeError:
            # Pure text sink (e.g. StringIO): latin-1 maps bytes 1:1 to chars.
            out.write(data.decode("latin-1"))

    def name_text(name: Union[str, bytes]) -> str:
        # Names may be bytes; "%s" % bytes would render as "b'...'".
        return name.decode("latin-1") if isinstance(name, bytes) else name

    if obj is None:
        out.write("<null />")
        return

    if isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for k, v in obj.items():
            out.write("<key>%s</key>\n" % k)
            out.write("<value>")
            dumpxml(out, v)
            out.write("</value>\n")
        out.write("</dict>")
        return

    if isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for v in obj:
            dumpxml(out, v)
            out.write("\n")
        out.write("</list>")
        return

    if isinstance(obj, (str, bytes)):
        out.write('<string size="%d">%s</string>' % (len(obj), escape(obj)))
        return

    if isinstance(obj, PDFStream):
        if codec == "raw":
            write_bytes(obj.get_rawdata())
        elif codec == "binary":
            write_bytes(obj.get_data())
        else:
            out.write("<stream>\n<props>\n")
            dumpxml(out, obj.attrs)
            out.write("\n</props>\n")
            if codec == "text":
                data = obj.get_data()
                out.write('<data size="%d">%s</data>\n' % (len(data), escape(data)))
            out.write("</stream>")
        return

    if isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)
        return

    if isinstance(obj, PSKeyword):
        out.write("<keyword>%s</keyword>" % name_text(obj.name))
        return

    if isinstance(obj, PSLiteral):
        out.write("<literal>%s</literal>" % name_text(obj.name))
        return

    if isnumber(obj):
        out.write("<number>%s</number>" % obj)
        return

    raise PDFTypeError(obj)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def dumptrailers(
    out: TextIO,
    doc: PDFDocument,
    show_fallback_xref: bool = False,
) -> None:
    """Write a ``<trailer>`` element for each xref of *doc*.

    ``PDFXRefFallback`` entries are skipped unless *show_fallback_xref* is
    true. A warning is logged when every xref is a fallback and none of them
    was shown.
    """
    only_fallbacks = True
    for xref in doc.xrefs:
        is_fallback = isinstance(xref, PDFXRefFallback)
        only_fallbacks = only_fallbacks and is_fallback
        if is_fallback and not show_fallback_xref:
            continue
        out.write("<trailer>\n")
        dumpxml(out, xref.get_trailer())
        out.write("\n</trailer>\n\n")

    if show_fallback_xref or not only_fallbacks:
        return

    logger.warning(
        "This PDF does not have an xref. Use --show-fallback-xref if "
        "you want to display the content of a fallback xref that "
        "contains all objects."
    )
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def dumpallobjs(
    out: TextIO,
    doc: PDFDocument,
    codec: Optional[str] = None,
    show_fallback_xref: bool = False,
) -> None:
    """Write every object reachable through the xrefs of *doc*, then the trailers.

    The output is wrapped in ``<pdf>...</pdf>``. Each object id is dumped at
    most once, and objects that resolve to ``None`` are skipped.

    :param out: destination stream for the XML.
    :param doc: the parsed document.
    :param codec: passed to ``dumpxml`` for each top-level object.
    :param show_fallback_xref: passed to ``dumptrailers``.
    """
    seen = set()
    out.write("<pdf>")
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in seen:
                continue
            seen.add(objid)
            try:
                obj = doc.getobj(objid)
                if obj is None:
                    continue
                out.write('<object id="%d">\n' % objid)
                dumpxml(out, obj, codec=codec)
                out.write("\n</object>\n\n")
            except PDFObjectNotFound as e:
                # Report on stderr: `out` is often stdout, and a diagnostic
                # printed there would end up inside the XML document.
                print("not found: %r" % e, file=sys.stderr)
    dumptrailers(out, doc, show_fallback_xref)
    out.write("</pdf>")
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def dumpoutline(
    outfp: TextIO,
    fname: str,
    objids: Any,
    pagenos: Container[int],
    password: str = "",
    dumpall: bool = False,
    codec: Optional[str] = None,
    extractdir: Optional[str] = None,
) -> None:
    """Write the outline (table of contents) of the PDF at *fname* as XML.

    Each entry becomes an ``<outline>`` element carrying its level and escaped
    title, plus ``<dest>`` and ``<pageno>`` children when a destination could
    be resolved. Nothing is written for a document that raises
    ``PDFNoOutlines``.

    :param outfp: destination stream for the XML.
    :param fname: path of the PDF file to read.
    :param password: password passed to ``PDFDocument``.

    ``objids``, ``pagenos``, ``dumpall``, ``codec`` and ``extractdir`` are
    accepted but not used by this function.
    """
    # The context manager and the try/finally make sure the file and the
    # parser are released even when parsing or writing fails part-way through.
    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            # Map page object id -> 1-based page number.
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
            }

            def resolve_dest(dest: object) -> Any:
                # Named destinations are looked up in the document first.
                if isinstance(dest, (str, bytes)):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest["D"]
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write("<outlines>\n")
                for level, title, dest, a, se in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get("S")
                            # Only GoTo actions with a destination are followed.
                            if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
                                dest = resolve_dest(action["D"])
                                pageno = pages[dest[0].objid]
                    s = escape(title)
                    outfp.write(f'<outline level="{level!r}" title="{s}">\n')
                    if dest is not None:
                        outfp.write("<dest>")
                        dumpxml(outfp, dest)
                        outfp.write("</dest>\n")
                    if pageno is not None:
                        outfp.write("<pageno>%r</pageno>\n" % pageno)
                    outfp.write("</outline>\n")
                outfp.write("</outlines>\n")
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
LITERAL_FILESPEC = LIT("Filespec")
|
| 211 |
+
LITERAL_EMBEDDEDFILE = LIT("EmbeddedFile")
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def extractembedded(fname: str, password: str, extractdir: str) -> None:
    """Extract the embedded files of the PDF at *fname* into *extractdir*.

    Every dict object whose ``Type`` is the ``Filespec`` literal is extracted
    once, to ``<extractdir>/<6-digit objid>-<basename>``.

    :param fname: path of the PDF file to read.
    :param password: password passed to ``PDFDocument``.
    :param extractdir: directory receiving the files (created if missing).
    :raises PDFValueError: if a file reference is not a ``PDFStream`` or the
        stream's ``Type`` is not the ``EmbeddedFile`` literal.
    :raises PDFIOError: if the target path already exists.
    """

    def extract1(objid: int, obj: Dict[str, Any]) -> None:
        # Prefer "UF" over "F" for both the file name and the stream reference.
        filename = os.path.basename(obj.get("UF") or cast(bytes, obj.get("F")).decode())
        fileref = obj["EF"].get("UF") or obj["EF"].get("F")
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            error_msg = (
                "unable to process PDF: reference for %r is not a "
                "PDFStream" % filename
            )
            raise PDFValueError(error_msg)
        if fileobj.get("Type") is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                "unable to process PDF: reference for %r "
                "is not an EmbeddedFile" % (filename),
            )
        path = os.path.join(extractdir, "%.6d-%s" % (objid, filename))
        if os.path.exists(path):
            raise PDFIOError("file exists: %r" % path)
        print("extracting: %r" % path)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # `with` closes the output file even if decoding or writing fails.
        with open(path, "wb") as out:
            out.write(fileobj.get_data())

    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        extracted_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in extracted_objids:
                    continue
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get("Type") is LITERAL_FILESPEC:
                    extracted_objids.add(objid)
                    extract1(objid, obj)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def dumppdf(
    outfp: TextIO,
    fname: str,
    objids: Iterable[int],
    pagenos: Container[int],
    password: str = "",
    dumpall: bool = False,
    codec: Optional[str] = None,
    extractdir: Optional[str] = None,
    show_fallback_xref: bool = False,
) -> None:
    """Dump selected parts of the PDF at *fname* to *outfp*.

    :param outfp: destination stream.
    :param fname: path of the PDF file to read.
    :param objids: object ids to dump individually.
    :param pagenos: page indices to dump; compared against a 0-based
        enumeration of the document's pages.
    :param password: password passed to ``PDFDocument``.
    :param dumpall: if true, dump every object via ``dumpallobjs``.
    :param codec: stream codec passed on to ``dumpxml``. For selected pages a
        truthy codec dumps the page content streams instead of the page
        attributes.
    :param extractdir: accepted but not used by this function.
    :param show_fallback_xref: passed on when trailers are dumped.

    When no object ids, no pages and no ``dumpall`` are requested, only the
    trailers are written. A final newline is appended unless *codec* is
    ``"raw"`` or ``"binary"``.
    """
    # `with` releases the input file even when parsing or dumping raises.
    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for pageno, page in enumerate(PDFPage.create_pages(doc)):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec, show_fallback_xref)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc, show_fallback_xref)
    if codec not in ("raw", "binary"):
        outfp.write("\n")
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def create_parser() -> ArgumentParser:
    """Build the command-line parser for the PDF structure dumper.

    Arguments fall into three sets: the procedure switches
    (``--extract-toc`` / ``--extract-embedded``, mutually exclusive), the
    "Parser" group (page and object selection, password) and the "Output"
    group (output file and the mutually exclusive stream codecs).
    """
    parser = ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument(
        "files",
        type=str,
        default=None,
        nargs="+",
        help="One or more paths to PDF files.",
    )

    parser.add_argument(
        "--version",
        "-v",
        action="version",
        version=f"pdfminer.six v{pdfminer.__version__}",
    )
    parser.add_argument(
        "--debug",
        "-d",
        default=False,
        action="store_true",
        help="Use debug logging level.",
    )
    procedure_parser = parser.add_mutually_exclusive_group()
    procedure_parser.add_argument(
        "--extract-toc",
        "-T",
        default=False,
        action="store_true",
        help="Extract structure of outline",
    )
    procedure_parser.add_argument(
        "--extract-embedded",
        "-E",
        type=str,
        help="Extract embedded files",
    )

    parse_params = parser.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )
    parse_params.add_argument(
        "--page-numbers",
        type=int,
        default=None,
        nargs="+",
        help="A space-separated list of page numbers to parse.",
    )
    parse_params.add_argument(
        "--pagenos",
        "-p",
        type=str,
        help="A comma-separated list of page numbers to parse. Included for "
        "legacy applications, use --page-numbers for more idiomatic "
        "argument entry.",
    )
    parse_params.add_argument(
        "--objects",
        "-i",
        type=str,
        help="Comma separated list of object numbers to extract",
    )
    parse_params.add_argument(
        "--all",
        "-a",
        default=False,
        action="store_true",
        help="If the structure of all objects should be extracted",
    )
    parse_params.add_argument(
        "--show-fallback-xref",
        action="store_true",
        help="Additionally show the fallback xref. Use this if the PDF "
        "has zero or only invalid xref's. This setting is ignored if "
        "--extract-toc or --extract-embedded is used.",
    )
    parse_params.add_argument(
        "--password",
        "-P",
        type=str,
        default="",
        help="The password to use for decrypting PDF file.",
    )

    output_params = parser.add_argument_group(
        "Output",
        description="Used during output generation.",
    )
    output_params.add_argument(
        "--outfile",
        "-o",
        type=str,
        default="-",
        help='Path to file where output is written. Or "-" (default) to '
        "write to stdout.",
    )
    # At most one stream codec may be selected.
    codec_parser = output_params.add_mutually_exclusive_group()
    codec_parser.add_argument(
        "--raw-stream",
        "-r",
        default=False,
        action="store_true",
        help="Write stream objects without encoding",
    )
    codec_parser.add_argument(
        "--binary-stream",
        "-b",
        default=False,
        action="store_true",
        help="Write stream objects with binary encoding",
    )
    codec_parser.add_argument(
        "--text-stream",
        "-t",
        default=False,
        action="store_true",
        help="Write stream objects as plain text",
    )

    return parser
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: parse *argv* and process each input file.

    For every file, exactly one procedure runs: the outline dump
    (``--extract-toc``), embedded-file extraction (``--extract-embedded``),
    or the regular structure dump.

    :param argv: argument list; ``None`` lets argparse read ``sys.argv``.
    """
    parser = create_parser()
    args = parser.parse_args(args=argv)

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    if args.objects:
        objids = [int(x) for x in args.objects.split(",")]
    else:
        objids = []

    # Page numbers are 1-based on the command line, 0-based internally.
    if args.page_numbers:
        pagenos = {x - 1 for x in args.page_numbers}
    elif args.pagenos:
        pagenos = {int(x) - 1 for x in args.pagenos.split(",")}
    else:
        pagenos = set()

    password = args.password

    if args.raw_stream:
        codec: Optional[str] = "raw"
    elif args.binary_stream:
        codec = "binary"
    elif args.text_stream:
        codec = "text"
    else:
        codec = None

    # The output is opened only after the arguments above have been validated,
    # so a bad --objects/--pagenos value no longer truncates the target file.
    to_stdout = args.outfile == "-"
    outfp = sys.stdout if to_stdout else open(args.outfile, "w")
    try:
        for fname in args.files:
            if args.extract_toc:
                dumpoutline(
                    outfp,
                    fname,
                    objids,
                    pagenos,
                    password=password,
                    dumpall=args.all,
                    codec=codec,
                    extractdir=None,
                )
            elif args.extract_embedded:
                extractembedded(fname, password=password, extractdir=args.extract_embedded)
            else:
                dumppdf(
                    outfp,
                    fname,
                    objids,
                    pagenos,
                    password=password,
                    dumpall=args.all,
                    codec=codec,
                    extractdir=None,
                    show_fallback_xref=args.show_fallback_xref,
                )
    finally:
        if to_stdout:
            # stdout is not ours to close: a caller invoking main() from
            # Python would lose its standard output. Flushing is enough.
            outfp.flush()
        else:
            outfp.close()
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
if __name__ == "__main__":
|
| 480 |
+
main()
|
venv/bin/f2py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from numpy.f2py.f2py2e import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/fastapi
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from fastapi.cli import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/fonttools
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from fontTools.__main__ import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/gradio
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from gradio.cli import cli
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(cli())
|
venv/bin/hf
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from huggingface_hub.cli.hf import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/httpx
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from httpx import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/huggingface-cli
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from huggingface_hub.commands.huggingface_cli import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/markdown-it
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from markdown_it.cli.parse import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/markdownify
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from markdownify.main import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/normalizer
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from charset_normalizer.cli import cli_detect
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(cli_detect())
|
venv/bin/numpy-config
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from numpy._configtool import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/pdf2txt.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
"""A command line tool for extracting text and images from PDF and
|
| 3 |
+
output it to plain text, html, xml or tags.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import logging
|
| 8 |
+
import sys
|
| 9 |
+
from typing import Any, Container, Iterable, List, Optional
|
| 10 |
+
|
| 11 |
+
import pdfminer.high_level
|
| 12 |
+
from pdfminer.layout import LAParams
|
| 13 |
+
from pdfminer.pdfexceptions import PDFValueError
|
| 14 |
+
from pdfminer.utils import AnyIO
|
| 15 |
+
|
| 16 |
+
logging.basicConfig()
|
| 17 |
+
|
| 18 |
+
OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def float_or_disabled(x: str) -> Optional[float]:
    """Argparse type: parse *x* as a float, or map ``"disabled"`` to ``None``.

    The ``"disabled"`` check ignores case and surrounding whitespace.

    :raises argparse.ArgumentTypeError: if *x* is neither ``"disabled"`` nor
        a valid float; the original ``ValueError`` is chained as the cause.
    """
    if x.lower().strip() == "disabled":
        return None
    try:
        return float(x)
    except ValueError as err:
        raise argparse.ArgumentTypeError(f"invalid float value: {x}") from err
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def extract_text(
    files: Iterable[str] = (),
    outfile: str = "-",
    laparams: Optional[LAParams] = None,
    output_type: str = "text",
    codec: str = "utf-8",
    strip_control: bool = False,
    maxpages: int = 0,
    page_numbers: Optional[Container[int]] = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: Optional[str] = None,
    debug: bool = False,
    disable_caching: bool = False,
    **kwargs: Any,
) -> AnyIO:
    """Extract every file in *files* into a single output stream.

    :param files: paths of the PDF files to process; must not be empty.
    :param outfile: output path, or ``"-"`` for ``sys.stdout``. When
        *output_type* is ``"text"`` and the path ends with a suffix listed in
        ``OUTPUT_TYPES``, the output type is switched accordingly.
    :param codec: output encoding; forced to ``"utf-8"`` when writing to a
        stdout that reports an encoding.
    :returns: the output stream, left open; the caller closes it.
    :raises PDFValueError: if *files* is empty.

    All other parameters are passed straight to
    ``pdfminer.high_level.extract_text_to_fp``.
    """
    if not files:
        raise PDFValueError("Must provide files to work upon!")

    if output_type == "text" and outfile != "-":
        for override, alttype in OUTPUT_TYPES:
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp: AnyIO = sys.stdout
        if sys.stdout.encoding is not None:
            codec = "utf-8"
    else:
        outfp = open(outfile, "wb")

    try:
        for fname in files:
            with open(fname, "rb") as fp:
                # NOTE: every local of this function is forwarded as a keyword
                # argument, so adding or renaming a local changes the call.
                # Presumably unknown keys are absorbed by the callee's
                # **kwargs; verify before touching locals.
                pdfminer.high_level.extract_text_to_fp(fp, **locals())
    except BaseException:
        # The caller never receives the handle on failure, so release the
        # file opened here (stdout is left alone).
        if outfile != "-":
            outfp.close()
        raise
    return outfp
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def create_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the text extraction tool.

    Arguments are organised in three groups: "Parser" (page selection,
    password, rotation), "Layout analysis" (defaults taken from a fresh
    ``LAParams`` instance) and "Output" (destination, format, encoding and
    HTML/XML specific switches).
    """
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument(
        "files",
        type=str,
        default=None,
        nargs="+",
        help="One or more paths to PDF files.",
    )

    parser.add_argument(
        "--version",
        "-v",
        action="version",
        version=f"pdfminer.six v{pdfminer.__version__}",
    )
    parser.add_argument(
        "--debug",
        "-d",
        default=False,
        action="store_true",
        help="Use debug logging level.",
    )
    parser.add_argument(
        "--disable-caching",
        "-C",
        default=False,
        action="store_true",
        help="If caching of resources, such as fonts, should be disabled.",
    )

    parse_params = parser.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )
    parse_params.add_argument(
        "--page-numbers",
        type=int,
        default=None,
        nargs="+",
        help="A space-separated list of page numbers to parse.",
    )
    parse_params.add_argument(
        "--pagenos",
        "-p",
        type=str,
        help="A comma-separated list of page numbers to parse. "
        "Included for legacy applications, use --page-numbers "
        "for more idiomatic argument entry.",
    )
    parse_params.add_argument(
        "--maxpages",
        "-m",
        type=int,
        default=0,
        help="The maximum number of pages to parse.",
    )
    parse_params.add_argument(
        "--password",
        "-P",
        type=str,
        default="",
        help="The password to use for decrypting PDF file.",
    )
    parse_params.add_argument(
        "--rotation",
        "-R",
        default=0,
        type=int,
        help="The number of degrees to rotate the PDF "
        "before other types of processing.",
    )

    la_params = LAParams()  # will be used for defaults
    la_param_group = parser.add_argument_group(
        "Layout analysis",
        description="Used during layout analysis.",
    )
    la_param_group.add_argument(
        "--no-laparams",
        "-n",
        default=False,
        action="store_true",
        help="If layout analysis parameters should be ignored.",
    )
    la_param_group.add_argument(
        "--detect-vertical",
        "-V",
        default=la_params.detect_vertical,
        action="store_true",
        help="If vertical text should be considered during layout analysis",
    )
    la_param_group.add_argument(
        "--line-overlap",
        type=float,
        default=la_params.line_overlap,
        help="If two characters have more overlap than this they "
        "are considered to be on the same line. The overlap is specified "
        "relative to the minimum height of both characters.",
    )
    la_param_group.add_argument(
        "--char-margin",
        "-M",
        type=float,
        default=la_params.char_margin,
        help="If two characters are closer together than this margin they "
        "are considered to be part of the same line. The margin is "
        "specified relative to the width of the character.",
    )
    la_param_group.add_argument(
        "--word-margin",
        "-W",
        type=float,
        default=la_params.word_margin,
        help="If two characters on the same line are further apart than this "
        "margin then they are considered to be two separate words, and "
        "an intermediate space will be added for readability. The margin "
        "is specified relative to the width of the character.",
    )
    la_param_group.add_argument(
        "--line-margin",
        "-L",
        type=float,
        default=la_params.line_margin,
        help="If two lines are close together they are considered to "
        "be part of the same paragraph. The margin is specified "
        "relative to the height of a line.",
    )
    la_param_group.add_argument(
        "--boxes-flow",
        "-F",
        type=float_or_disabled,
        default=la_params.boxes_flow,
        help="Specifies how much a horizontal and vertical position of a "
        "text matters when determining the order of lines. The value "
        "should be within the range of -1.0 (only horizontal position "
        "matters) to +1.0 (only vertical position matters). You can also "
        "pass `disabled` to disable advanced layout analysis, and "
        "instead return text based on the position of the bottom left "
        "corner of the text box.",
    )
    la_param_group.add_argument(
        "--all-texts",
        "-A",
        default=la_params.all_texts,
        action="store_true",
        help="If layout analysis should be performed on text in figures.",
    )

    output_params = parser.add_argument_group(
        "Output",
        description="Used during output generation.",
    )
    output_params.add_argument(
        "--outfile",
        "-o",
        type=str,
        default="-",
        help="Path to file where output is written. "
        'Or "-" (default) to write to stdout.',
    )
    output_params.add_argument(
        "--output_type",
        "-t",
        type=str,
        default="text",
        help="Type of output to generate {text,html,xml,tag}.",
    )
    output_params.add_argument(
        "--codec",
        "-c",
        type=str,
        default="utf-8",
        help="Text encoding to use in output file.",
    )
    output_params.add_argument(
        "--output-dir",
        "-O",
        default=None,
        help="The output directory to put extracted images in. If not given, "
        "images are not extracted.",
    )
    output_params.add_argument(
        "--layoutmode",
        "-Y",
        default="normal",
        type=str,
        help="Type of layout to use when generating html "
        "{normal,exact,loose}. If normal, each line is"
        " positioned separately in the html. If exact"
        ", each character is positioned separately in"
        " the html. If loose, same result as normal "
        "but with an additional newline after each "
        "text line. Only used when output_type is html.",
    )
    output_params.add_argument(
        "--scale",
        "-s",
        type=float,
        default=1.0,
        help="The amount of zoom to use when generating html file. "
        "Only used when output_type is html.",
    )
    output_params.add_argument(
        "--strip-control",
        "-S",
        default=False,
        action="store_true",
        help="Remove control statement from text. "
        "Only used when output_type is xml.",
    )

    return parser
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
|
| 285 |
+
parsed_args = create_parser().parse_args(args=args)
|
| 286 |
+
|
| 287 |
+
# Propagate parsed layout parameters to LAParams object
|
| 288 |
+
if parsed_args.no_laparams:
|
| 289 |
+
parsed_args.laparams = None
|
| 290 |
+
else:
|
| 291 |
+
parsed_args.laparams = LAParams(
|
| 292 |
+
line_overlap=parsed_args.line_overlap,
|
| 293 |
+
char_margin=parsed_args.char_margin,
|
| 294 |
+
line_margin=parsed_args.line_margin,
|
| 295 |
+
word_margin=parsed_args.word_margin,
|
| 296 |
+
boxes_flow=parsed_args.boxes_flow,
|
| 297 |
+
detect_vertical=parsed_args.detect_vertical,
|
| 298 |
+
all_texts=parsed_args.all_texts,
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
if parsed_args.page_numbers:
|
| 302 |
+
parsed_args.page_numbers = {x - 1 for x in parsed_args.page_numbers}
|
| 303 |
+
|
| 304 |
+
if parsed_args.pagenos:
|
| 305 |
+
parsed_args.page_numbers = {int(x) - 1 for x in parsed_args.pagenos.split(",")}
|
| 306 |
+
|
| 307 |
+
if parsed_args.output_type == "text" and parsed_args.outfile != "-":
|
| 308 |
+
for override, alttype in OUTPUT_TYPES:
|
| 309 |
+
if parsed_args.outfile.endswith(override):
|
| 310 |
+
parsed_args.output_type = alttype
|
| 311 |
+
|
| 312 |
+
return parsed_args
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def main(args: Optional[List[str]] = None) -> int:
|
| 316 |
+
parsed_args = parse_args(args)
|
| 317 |
+
outfp = extract_text(**vars(parsed_args))
|
| 318 |
+
outfp.close()
|
| 319 |
+
return 0
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
if __name__ == "__main__":
|
| 323 |
+
sys.exit(main())
|
venv/bin/pdfplumber
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from pdfplumber.cli import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/pip
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from pip._internal.cli.main import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/pip3
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from pip._internal.cli.main import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/pip3.9
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from pip._internal.cli.main import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/pyftmerge
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from fontTools.merge import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/pyftsubset
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from fontTools.subset import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/pygmentize
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from pygments.cmdline import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/pypdfium2
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from pypdfium2.__main__ import cli_main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(cli_main())
|
venv/bin/python
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c54ab0ae7c3b248e10f847477b01639de71d9b1d67aee809f1f4da196caca8cc
|
| 3 |
+
size 15880080
|
venv/bin/python3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c54ab0ae7c3b248e10f847477b01639de71d9b1d67aee809f1f4da196caca8cc
|
| 3 |
+
size 15880080
|
venv/bin/python3.9
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c54ab0ae7c3b248e10f847477b01639de71d9b1d67aee809f1f4da196caca8cc
|
| 3 |
+
size 15880080
|
venv/bin/ruff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5321df702f31419551f21b921c00fa3a5c8347ac399d0a779bdca4eca3daad9c
|
| 3 |
+
size 37014240
|
venv/bin/tiny-agents
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from huggingface_hub.inference._mcp.cli import app
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(app())
|
venv/bin/tqdm
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from tqdm.cli import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/ttx
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from fontTools.ttx import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/typer
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from typer.cli import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/upload_theme
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from gradio.themes.upload_theme import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/bin/uvicorn
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/home/l.sottani/rag/preprocessing/venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from uvicorn.main import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
venv/lib/python3.9/site-packages/MarkupSafe-2.1.5.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
venv/lib/python3.9/site-packages/MarkupSafe-2.1.5.dist-info/LICENSE.rst
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Copyright 2010 Pallets
|
| 2 |
+
|
| 3 |
+
Redistribution and use in source and binary forms, with or without
|
| 4 |
+
modification, are permitted provided that the following conditions are
|
| 5 |
+
met:
|
| 6 |
+
|
| 7 |
+
1. Redistributions of source code must retain the above copyright
|
| 8 |
+
notice, this list of conditions and the following disclaimer.
|
| 9 |
+
|
| 10 |
+
2. Redistributions in binary form must reproduce the above copyright
|
| 11 |
+
notice, this list of conditions and the following disclaimer in the
|
| 12 |
+
documentation and/or other materials provided with the distribution.
|
| 13 |
+
|
| 14 |
+
3. Neither the name of the copyright holder nor the names of its
|
| 15 |
+
contributors may be used to endorse or promote products derived from
|
| 16 |
+
this software without specific prior written permission.
|
| 17 |
+
|
| 18 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 19 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 20 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
| 21 |
+
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 22 |
+
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 23 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
| 24 |
+
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 25 |
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
| 26 |
+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
| 27 |
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
| 28 |
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
venv/lib/python3.9/site-packages/MarkupSafe-2.1.5.dist-info/METADATA
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.1
|
| 2 |
+
Name: MarkupSafe
|
| 3 |
+
Version: 2.1.5
|
| 4 |
+
Summary: Safely add untrusted strings to HTML/XML markup.
|
| 5 |
+
Home-page: https://palletsprojects.com/p/markupsafe/
|
| 6 |
+
Maintainer: Pallets
|
| 7 |
+
Maintainer-email: contact@palletsprojects.com
|
| 8 |
+
License: BSD-3-Clause
|
| 9 |
+
Project-URL: Donate, https://palletsprojects.com/donate
|
| 10 |
+
Project-URL: Documentation, https://markupsafe.palletsprojects.com/
|
| 11 |
+
Project-URL: Changes, https://markupsafe.palletsprojects.com/changes/
|
| 12 |
+
Project-URL: Source Code, https://github.com/pallets/markupsafe/
|
| 13 |
+
Project-URL: Issue Tracker, https://github.com/pallets/markupsafe/issues/
|
| 14 |
+
Project-URL: Chat, https://discord.gg/pallets
|
| 15 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 16 |
+
Classifier: Environment :: Web Environment
|
| 17 |
+
Classifier: Intended Audience :: Developers
|
| 18 |
+
Classifier: License :: OSI Approved :: BSD License
|
| 19 |
+
Classifier: Operating System :: OS Independent
|
| 20 |
+
Classifier: Programming Language :: Python
|
| 21 |
+
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
|
| 22 |
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
| 23 |
+
Requires-Python: >=3.7
|
| 24 |
+
Description-Content-Type: text/x-rst
|
| 25 |
+
License-File: LICENSE.rst
|
| 26 |
+
|
| 27 |
+
MarkupSafe
|
| 28 |
+
==========
|
| 29 |
+
|
| 30 |
+
MarkupSafe implements a text object that escapes characters so it is
|
| 31 |
+
safe to use in HTML and XML. Characters that have special meanings are
|
| 32 |
+
replaced so that they display as the actual characters. This mitigates
|
| 33 |
+
injection attacks, meaning untrusted user input can safely be displayed
|
| 34 |
+
on a page.
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
Installing
|
| 38 |
+
----------
|
| 39 |
+
|
| 40 |
+
Install and update using `pip`_:
|
| 41 |
+
|
| 42 |
+
.. code-block:: text
|
| 43 |
+
|
| 44 |
+
pip install -U MarkupSafe
|
| 45 |
+
|
| 46 |
+
.. _pip: https://pip.pypa.io/en/stable/getting-started/
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
Examples
|
| 50 |
+
--------
|
| 51 |
+
|
| 52 |
+
.. code-block:: pycon
|
| 53 |
+
|
| 54 |
+
>>> from markupsafe import Markup, escape
|
| 55 |
+
|
| 56 |
+
>>> # escape replaces special characters and wraps in Markup
|
| 57 |
+
>>> escape("<script>alert(document.cookie);</script>")
|
| 58 |
+
Markup('&lt;script&gt;alert(document.cookie);&lt;/script&gt;')
|
| 59 |
+
|
| 60 |
+
>>> # wrap in Markup to mark text "safe" and prevent escaping
|
| 61 |
+
>>> Markup("<strong>Hello</strong>")
|
| 62 |
+
Markup('<strong>Hello</strong>')
|
| 63 |
+
|
| 64 |
+
>>> escape(Markup("<strong>Hello</strong>"))
|
| 65 |
+
Markup('<strong>Hello</strong>')
|
| 66 |
+
|
| 67 |
+
>>> # Markup is a str subclass
|
| 68 |
+
>>> # methods and operators escape their arguments
|
| 69 |
+
>>> template = Markup("Hello <em>{name}</em>")
|
| 70 |
+
>>> template.format(name='"World"')
|
| 71 |
+
Markup('Hello <em>&#34;World&#34;</em>')
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
Donate
|
| 75 |
+
------
|
| 76 |
+
|
| 77 |
+
The Pallets organization develops and supports MarkupSafe and other
|
| 78 |
+
popular packages. In order to grow the community of contributors and
|
| 79 |
+
users, and allow the maintainers to devote more time to the projects,
|
| 80 |
+
`please donate today`_.
|
| 81 |
+
|
| 82 |
+
.. _please donate today: https://palletsprojects.com/donate
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
Links
|
| 86 |
+
-----
|
| 87 |
+
|
| 88 |
+
- Documentation: https://markupsafe.palletsprojects.com/
|
| 89 |
+
- Changes: https://markupsafe.palletsprojects.com/changes/
|
| 90 |
+
- PyPI Releases: https://pypi.org/project/MarkupSafe/
|
| 91 |
+
- Source Code: https://github.com/pallets/markupsafe/
|
| 92 |
+
- Issue Tracker: https://github.com/pallets/markupsafe/issues/
|
| 93 |
+
- Chat: https://discord.gg/pallets
|