fix RabbitMQ
Browse files- __pycache__/topic_extraction.cpython-310.pyc +0 -0
- p_ms.PDF +0 -0
- pearson_json/_subtopics.json +39 -117
- topic_extraction.log +552 -0
- topic_extraction.py +52 -36
- worker.py +1 -0
__pycache__/topic_extraction.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/topic_extraction.cpython-310.pyc and b/__pycache__/topic_extraction.cpython-310.pyc differ
|
|
|
p_ms.PDF
DELETED
|
Binary file (291 kB)
|
|
|
pearson_json/_subtopics.json
CHANGED
|
@@ -6,6 +6,10 @@
|
|
| 6 |
"type": "image",
|
| 7 |
"key": "/topic-extraction/cells/img_1.jpg_r0_c0.png"
|
| 8 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
{
|
| 10 |
"type": "image",
|
| 11 |
"key": "/topic-extraction/cells/img_3.jpg_r0_c0.png"
|
|
@@ -24,177 +28,95 @@
|
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"type": "image",
|
| 27 |
-
"key": "/topic-extraction/cells/
|
| 28 |
},
|
| 29 |
{
|
| 30 |
"type": "image",
|
| 31 |
-
"key": "/topic-extraction/cells/
|
| 32 |
},
|
| 33 |
{
|
| 34 |
"type": "image",
|
| 35 |
-
"key": "/topic-extraction/cells/
|
| 36 |
},
|
| 37 |
{
|
| 38 |
"type": "image",
|
| 39 |
-
"key": "/topic-extraction/cells/
|
| 40 |
},
|
| 41 |
{
|
| 42 |
"type": "image",
|
| 43 |
-
"key": "/topic-extraction/cells/
|
| 44 |
},
|
| 45 |
{
|
| 46 |
"type": "image",
|
| 47 |
-
"key": "/topic-extraction/cells/
|
| 48 |
},
|
| 49 |
{
|
| 50 |
"type": "image",
|
| 51 |
-
"key": "/topic-extraction/cells/
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"type": "image",
|
| 55 |
-
"key": "/topic-extraction/cells/
|
| 56 |
},
|
| 57 |
{
|
| 58 |
"type": "image",
|
| 59 |
-
"key": "/topic-extraction/cells/
|
| 60 |
},
|
| 61 |
{
|
| 62 |
"type": "image",
|
| 63 |
-
"key": "/topic-extraction/cells/
|
| 64 |
},
|
| 65 |
{
|
| 66 |
"type": "image",
|
| 67 |
-
"key": "/topic-extraction/cells/
|
| 68 |
-
}
|
| 69 |
-
],
|
| 70 |
-
"children": []
|
| 71 |
-
},
|
| 72 |
-
{
|
| 73 |
-
"title": "Factors influencing demand and supply in product markets",
|
| 74 |
-
"contents": [
|
| 75 |
-
{
|
| 76 |
-
"type": "image",
|
| 77 |
-
"key": "/topic-extraction/cells/img_2.jpg_r1_c0.png"
|
| 78 |
-
}
|
| 79 |
-
],
|
| 80 |
-
"children": []
|
| 81 |
-
},
|
| 82 |
-
{
|
| 83 |
-
"title": "Why and how governments intervene in markets",
|
| 84 |
-
"contents": [
|
| 85 |
{
|
| 86 |
"type": "image",
|
| 87 |
-
"key": "/topic-extraction/cells/
|
| 88 |
-
}
|
| 89 |
-
],
|
| 90 |
-
"children": []
|
| 91 |
-
},
|
| 92 |
-
{
|
| 93 |
-
"title": "The circular flow of income model",
|
| 94 |
-
"contents": [
|
| 95 |
{
|
| 96 |
"type": "image",
|
| 97 |
-
"key": "/topic-extraction/cells/
|
| 98 |
-
}
|
| 99 |
-
],
|
| 100 |
-
"children": []
|
| 101 |
-
},
|
| 102 |
-
{
|
| 103 |
-
"title": "Government policy objectives",
|
| 104 |
-
"contents": [
|
| 105 |
{
|
| 106 |
"type": "image",
|
| 107 |
-
"key": "/topic-extraction/cells/
|
| 108 |
-
}
|
| 109 |
-
],
|
| 110 |
-
"children": []
|
| 111 |
-
},
|
| 112 |
-
{
|
| 113 |
-
"title": "Fiscal policy",
|
| 114 |
-
"contents": [
|
| 115 |
{
|
| 116 |
"type": "image",
|
| 117 |
-
"key": "/topic-extraction/cells/
|
| 118 |
-
}
|
| 119 |
-
],
|
| 120 |
-
"children": []
|
| 121 |
-
},
|
| 122 |
-
{
|
| 123 |
-
"title": "Monetary policy",
|
| 124 |
-
"contents": [
|
| 125 |
{
|
| 126 |
"type": "image",
|
| 127 |
-
"key": "/topic-extraction/cells/
|
| 128 |
-
}
|
| 129 |
-
],
|
| 130 |
-
"children": []
|
| 131 |
-
},
|
| 132 |
-
{
|
| 133 |
-
"title": "Exchange rates and exchange rate policy",
|
| 134 |
-
"contents": [
|
| 135 |
{
|
| 136 |
"type": "image",
|
| 137 |
-
"key": "/topic-extraction/cells/
|
| 138 |
-
}
|
| 139 |
-
],
|
| 140 |
-
"children": []
|
| 141 |
-
},
|
| 142 |
-
{
|
| 143 |
-
"title": "Free trade and protectionism",
|
| 144 |
-
"contents": [
|
| 145 |
{
|
| 146 |
"type": "image",
|
| 147 |
-
"key": "/topic-extraction/cells/
|
| 148 |
-
}
|
| 149 |
-
],
|
| 150 |
-
"children": []
|
| 151 |
-
},
|
| 152 |
-
{
|
| 153 |
-
"title": "Monopoly",
|
| 154 |
-
"contents": [
|
| 155 |
{
|
| 156 |
"type": "image",
|
| 157 |
-
"key": "/topic-extraction/cells/
|
| 158 |
-
}
|
| 159 |
-
],
|
| 160 |
-
"children": []
|
| 161 |
-
},
|
| 162 |
-
{
|
| 163 |
-
"title": "Economic growth",
|
| 164 |
-
"contents": [
|
| 165 |
{
|
| 166 |
"type": "image",
|
| 167 |
-
"key": "/topic-extraction/cells/
|
| 168 |
-
}
|
| 169 |
-
],
|
| 170 |
-
"children": []
|
| 171 |
-
},
|
| 172 |
-
{
|
| 173 |
-
"title": "Inflation and deflation",
|
| 174 |
-
"contents": [
|
| 175 |
{
|
| 176 |
"type": "image",
|
| 177 |
-
"key": "/topic-extraction/cells/
|
| 178 |
-
}
|
| 179 |
-
],
|
| 180 |
-
"children": []
|
| 181 |
-
},
|
| 182 |
-
{
|
| 183 |
-
"title": "The balance of payments",
|
| 184 |
-
"contents": [
|
| 185 |
{
|
| 186 |
"type": "image",
|
| 187 |
-
"key": "/topic-extraction/cells/
|
| 188 |
-
}
|
| 189 |
-
],
|
| 190 |
-
"children": []
|
| 191 |
-
},
|
| 192 |
-
{
|
| 193 |
-
"title": "Control of the national (public sector) debt",
|
| 194 |
-
"contents": [
|
| 195 |
{
|
| 196 |
"type": "image",
|
| 197 |
-
"key": "/topic-extraction/cells/
|
| 198 |
}
|
| 199 |
],
|
| 200 |
"children": []
|
|
|
|
| 6 |
"type": "image",
|
| 7 |
"key": "/topic-extraction/cells/img_1.jpg_r0_c0.png"
|
| 8 |
},
|
| 9 |
+
{
|
| 10 |
+
"type": "image",
|
| 11 |
+
"key": "/topic-extraction/cells/img_2.jpg_r0_c0.png"
|
| 12 |
+
},
|
| 13 |
{
|
| 14 |
"type": "image",
|
| 15 |
"key": "/topic-extraction/cells/img_3.jpg_r0_c0.png"
|
|
|
|
| 28 |
},
|
| 29 |
{
|
| 30 |
"type": "image",
|
| 31 |
+
"key": "/topic-extraction/cells/img_7.jpg_r0_c0.png"
|
| 32 |
},
|
| 33 |
{
|
| 34 |
"type": "image",
|
| 35 |
+
"key": "/topic-extraction/cells/img_8.jpg_r1_c0.png"
|
| 36 |
},
|
| 37 |
{
|
| 38 |
"type": "image",
|
| 39 |
+
"key": "/topic-extraction/cells/img_9.jpg_r0_c0.png"
|
| 40 |
},
|
| 41 |
{
|
| 42 |
"type": "image",
|
| 43 |
+
"key": "/topic-extraction/cells/img_10.jpg_r0_c0.png"
|
| 44 |
},
|
| 45 |
{
|
| 46 |
"type": "image",
|
| 47 |
+
"key": "/topic-extraction/cells/img_11.jpg_r0_c0.png"
|
| 48 |
},
|
| 49 |
{
|
| 50 |
"type": "image",
|
| 51 |
+
"key": "/topic-extraction/cells/img_12.jpg_r0_c0.png"
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"type": "image",
|
| 55 |
+
"key": "/topic-extraction/cells/img_13.jpg_r0_c1.png"
|
| 56 |
},
|
| 57 |
{
|
| 58 |
"type": "image",
|
| 59 |
+
"key": "/topic-extraction/cells/img_14.jpg_r0_c0.png"
|
| 60 |
},
|
| 61 |
{
|
| 62 |
"type": "image",
|
| 63 |
+
"key": "/topic-extraction/cells/img_15.jpg_r0_c0.png"
|
| 64 |
},
|
| 65 |
{
|
| 66 |
"type": "image",
|
| 67 |
+
"key": "/topic-extraction/cells/img_16.jpg_r0_c0.png"
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"type": "image",
|
| 71 |
+
"key": "/topic-extraction/cells/img_17.jpg_r1_c0.png"
|
| 72 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
{
|
| 74 |
"type": "image",
|
| 75 |
+
"key": "/topic-extraction/cells/img_18.jpg_r0_c0.png"
|
| 76 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
{
|
| 78 |
"type": "image",
|
| 79 |
+
"key": "/topic-extraction/cells/img_19.jpg_r0_c0.png"
|
| 80 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
{
|
| 82 |
"type": "image",
|
| 83 |
+
"key": "/topic-extraction/cells/img_20.jpg_r0_c0.png"
|
| 84 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
{
|
| 86 |
"type": "image",
|
| 87 |
+
"key": "/topic-extraction/cells/img_21.jpg_r0_c0.png"
|
| 88 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
{
|
| 90 |
"type": "image",
|
| 91 |
+
"key": "/topic-extraction/cells/img_22.jpg_r0_c0.png"
|
| 92 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
{
|
| 94 |
"type": "image",
|
| 95 |
+
"key": "/topic-extraction/cells/img_23.jpg_r0_c0.png"
|
| 96 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
{
|
| 98 |
"type": "image",
|
| 99 |
+
"key": "/topic-extraction/cells/img_24.jpg_r0_c0.png"
|
| 100 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
{
|
| 102 |
"type": "image",
|
| 103 |
+
"key": "/topic-extraction/cells/img_25.jpg_r1_c0.png"
|
| 104 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
{
|
| 106 |
"type": "image",
|
| 107 |
+
"key": "/topic-extraction/cells/img_26.jpg_r0_c0.png"
|
| 108 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
{
|
| 110 |
"type": "image",
|
| 111 |
+
"key": "/topic-extraction/cells/img_27.jpg_r0_c0.png"
|
| 112 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
{
|
| 114 |
"type": "image",
|
| 115 |
+
"key": "/topic-extraction/cells/img_28.jpg_r0_c0.png"
|
| 116 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
{
|
| 118 |
"type": "image",
|
| 119 |
+
"key": "/topic-extraction/cells/img_29.jpg_r0_c0.png"
|
| 120 |
}
|
| 121 |
],
|
| 122 |
"children": []
|
topic_extraction.log
CHANGED
|
@@ -6931,3 +6931,555 @@ and series'. Using page 7.
|
|
| 6931 |
2025-03-04 16:17:43,682 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
|
| 6932 |
2025-03-04 16:17:43,995 [INFO] __main__ - GPU memory cleaned up.
|
| 6933 |
2025-03-04 16:17:44,000 [INFO] __main__ - Processing completed successfully.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6931 |
2025-03-04 16:17:43,682 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
|
| 6932 |
2025-03-04 16:17:43,995 [INFO] __main__ - GPU memory cleaned up.
|
| 6933 |
2025-03-04 16:17:44,000 [INFO] __main__ - Processing completed successfully.
|
| 6934 |
+
2025-03-04 16:39:05,313 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf
|
| 6935 |
+
2025-03-04 16:39:06,086 [INFO] __main__ - Gemini returned subtopics: {'2.1AS units': [7, 22], '2.2A2 units': [23, 43]}
|
| 6936 |
+
2025-03-04 16:39:06,088 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf'
|
| 6937 |
+
2025-03-04 16:39:06,326 [INFO] __main__ - Computed global offset: 0
|
| 6938 |
+
2025-03-04 16:39:06,326 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
|
| 6939 |
+
2025-03-04 16:39:49,136 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
|
| 6940 |
+
2025-03-04 16:39:49,708 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
|
| 6941 |
+
2025-03-04 16:39:50,157 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
|
| 6942 |
+
2025-03-04 16:39:50,688 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
|
| 6943 |
+
2025-03-04 16:39:51,083 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
|
| 6944 |
+
2025-03-04 16:39:51,533 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
|
| 6945 |
+
2025-03-04 16:39:52,100 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
|
| 6946 |
+
2025-03-04 16:39:52,532 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
|
| 6947 |
+
2025-03-04 16:39:52,942 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
|
| 6948 |
+
2025-03-04 16:39:53,244 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
|
| 6949 |
+
2025-03-04 16:39:53,742 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
|
| 6950 |
+
2025-03-04 16:39:54,213 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
|
| 6951 |
+
2025-03-04 16:39:54,761 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
|
| 6952 |
+
2025-03-04 16:39:55,050 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
|
| 6953 |
+
2025-03-04 16:39:55,740 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
|
| 6954 |
+
2025-03-04 16:39:56,304 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
|
| 6955 |
+
2025-03-04 16:39:56,780 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
|
| 6956 |
+
2025-03-04 16:39:57,175 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
|
| 6957 |
+
2025-03-04 16:39:57,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
|
| 6958 |
+
2025-03-04 16:39:58,140 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
|
| 6959 |
+
2025-03-04 16:39:58,682 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
|
| 6960 |
+
2025-03-04 16:39:59,190 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
|
| 6961 |
+
2025-03-04 16:39:59,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
|
| 6962 |
+
2025-03-04 16:40:00,229 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
|
| 6963 |
+
2025-03-04 16:40:00,732 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
|
| 6964 |
+
2025-03-04 16:40:01,136 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
|
| 6965 |
+
2025-03-04 16:40:01,706 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
|
| 6966 |
+
2025-03-04 16:40:02,236 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
|
| 6967 |
+
2025-03-04 16:40:02,621 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
|
| 6968 |
+
2025-03-04 16:40:03,051 [INFO] __main__ - Classifying images to detect tables.
|
| 6969 |
+
2025-03-04 16:40:06,927 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
|
| 6970 |
+
2025-03-04 16:40:10,403 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r0_c0.png
|
| 6971 |
+
2025-03-04 16:40:11,481 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c0.png
|
| 6972 |
+
2025-03-04 16:40:12,796 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c1.png
|
| 6973 |
+
2025-03-04 16:40:13,767 [ERROR] __main__ - Gemini subtopic identification error on attempt 0: Expecting value: line 1 column 1 (char 0)
|
| 6974 |
+
2025-03-04 16:40:15,308 [ERROR] __main__ - Gemini subtopic identification error on attempt 1: Expecting value: line 1 column 1 (char 0)
|
| 6975 |
+
2025-03-04 16:40:15,585 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c0.png
|
| 6976 |
+
2025-03-04 16:40:18,265 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c1.png
|
| 6977 |
+
2025-03-04 16:40:19,708 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c0.png
|
| 6978 |
+
2025-03-04 16:40:20,908 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c1.png
|
| 6979 |
+
2025-03-04 16:40:22,033 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
|
| 6980 |
+
2025-03-04 16:40:22,999 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
|
| 6981 |
+
2025-03-04 16:40:22,999 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
|
| 6982 |
+
2025-03-04 16:40:26,396 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r0_c0.png
|
| 6983 |
+
2025-03-04 16:40:27,834 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c0.png
|
| 6984 |
+
2025-03-04 16:40:29,314 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c1.png
|
| 6985 |
+
2025-03-04 16:40:30,652 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c0.png
|
| 6986 |
+
2025-03-04 16:40:32,068 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c1.png
|
| 6987 |
+
2025-03-04 16:40:33,239 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c0.png
|
| 6988 |
+
2025-03-04 16:40:34,633 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c1.png
|
| 6989 |
+
2025-03-04 16:40:35,597 [WARNING] __main__ - Cell image not found: /tmp/tmpkr6p74mz.jpg_rows/row_4/col_0.png
|
| 6990 |
+
2025-03-04 16:40:35,598 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
|
| 6991 |
+
2025-03-04 16:40:38,470 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r0_c0.png
|
| 6992 |
+
2025-03-04 16:40:39,732 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c0.png
|
| 6993 |
+
2025-03-04 16:40:41,236 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c1.png
|
| 6994 |
+
2025-03-04 16:40:42,293 [WARNING] __main__ - Cell image not found: /tmp/tmp_4ioykgq.jpg_rows/row_2/col_0.png
|
| 6995 |
+
2025-03-04 16:40:42,293 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
|
| 6996 |
+
2025-03-04 16:40:44,974 [WARNING] __main__ - Cell image not found: /tmp/tmp8qnr07bo.jpg_rows/row_0/col_0.png
|
| 6997 |
+
2025-03-04 16:40:44,974 [WARNING] __main__ - Cell image not found: /tmp/tmp8qnr07bo.jpg_rows/row_0/col_1.png
|
| 6998 |
+
2025-03-04 16:40:45,250 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r1_c0.png
|
| 6999 |
+
2025-03-04 16:40:46,109 [WARNING] __main__ - Cell image not found: /tmp/tmp8qnr07bo.jpg_rows/row_1/col_1.png
|
| 7000 |
+
2025-03-04 16:40:46,385 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c0.png
|
| 7001 |
+
2025-03-04 16:40:47,759 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c1.png
|
| 7002 |
+
2025-03-04 16:40:58,221 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c0.png
|
| 7003 |
+
2025-03-04 16:40:59,680 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c1.png
|
| 7004 |
+
2025-03-04 16:41:00,920 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
|
| 7005 |
+
2025-03-04 16:41:01,935 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
|
| 7006 |
+
2025-03-04 16:41:01,936 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
|
| 7007 |
+
2025-03-04 16:41:03,077 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r0_c0.png
|
| 7008 |
+
2025-03-04 16:41:04,211 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c0.png
|
| 7009 |
+
2025-03-04 16:41:05,575 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c1.png
|
| 7010 |
+
2025-03-04 16:41:06,564 [WARNING] __main__ - Cell image not found: /tmp/tmph28hdp5v.jpg_rows/row_2/col_0.png
|
| 7011 |
+
2025-03-04 16:41:06,564 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
|
| 7012 |
+
2025-03-04 16:41:08,734 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r0_c0.png
|
| 7013 |
+
2025-03-04 16:41:09,813 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c0.png
|
| 7014 |
+
2025-03-04 16:41:11,241 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c1.png
|
| 7015 |
+
2025-03-04 16:41:12,566 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
|
| 7016 |
+
2025-03-04 16:41:13,534 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
|
| 7017 |
+
2025-03-04 16:41:13,535 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
|
| 7018 |
+
2025-03-04 16:41:17,251 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r0_c0.png
|
| 7019 |
+
2025-03-04 16:41:18,440 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c0.png
|
| 7020 |
+
2025-03-04 16:41:20,101 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c1.png
|
| 7021 |
+
2025-03-04 16:41:21,465 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c0.png
|
| 7022 |
+
2025-03-04 16:41:22,836 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c1.png
|
| 7023 |
+
2025-03-04 16:41:24,168 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
|
| 7024 |
+
2025-03-04 16:41:25,738 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
|
| 7025 |
+
2025-03-04 16:41:25,738 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
|
| 7026 |
+
2025-03-04 16:41:28,044 [WARNING] __main__ - Cell image not found: /tmp/tmp2s0xxpac.jpg_rows/row_0/col_0.png
|
| 7027 |
+
2025-03-04 16:41:28,044 [WARNING] __main__ - Cell image not found: /tmp/tmp2s0xxpac.jpg_rows/row_0/col_1.png
|
| 7028 |
+
2025-03-04 16:41:28,317 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r1_c0.png
|
| 7029 |
+
2025-03-04 16:41:29,465 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c0.png
|
| 7030 |
+
2025-03-04 16:41:30,769 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c1.png
|
| 7031 |
+
2025-03-04 16:41:32,249 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c0.png
|
| 7032 |
+
2025-03-04 16:41:33,665 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c1.png
|
| 7033 |
+
2025-03-04 16:41:34,812 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
|
| 7034 |
+
2025-03-04 16:41:35,736 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
|
| 7035 |
+
2025-03-04 16:41:35,737 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
|
| 7036 |
+
2025-03-04 16:41:38,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r0_c0.png
|
| 7037 |
+
2025-03-04 16:41:39,194 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c0.png
|
| 7038 |
+
2025-03-04 16:41:40,315 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c1.png
|
| 7039 |
+
2025-03-04 16:41:41,569 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c0.png
|
| 7040 |
+
2025-03-04 16:41:42,914 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c1.png
|
| 7041 |
+
2025-03-04 16:41:44,447 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c0.png
|
| 7042 |
+
2025-03-04 16:41:45,567 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c1.png
|
| 7043 |
+
2025-03-04 16:41:46,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
|
| 7044 |
+
2025-03-04 16:41:47,934 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
|
| 7045 |
+
2025-03-04 16:41:47,935 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
|
| 7046 |
+
2025-03-04 16:41:48,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r0_c0.png
|
| 7047 |
+
2025-03-04 16:41:49,986 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c0.png
|
| 7048 |
+
2025-03-04 16:41:51,601 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c1.png
|
| 7049 |
+
2025-03-04 16:41:52,769 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
|
| 7050 |
+
2025-03-04 16:41:53,870 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
|
| 7051 |
+
2025-03-04 16:41:53,871 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
|
| 7052 |
+
2025-03-04 16:41:56,443 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r0_c0.png
|
| 7053 |
+
2025-03-04 16:41:57,555 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c0.png
|
| 7054 |
+
2025-03-04 16:41:59,137 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c1.png
|
| 7055 |
+
2025-03-04 16:42:00,476 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
|
| 7056 |
+
2025-03-04 16:42:01,609 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
|
| 7057 |
+
2025-03-04 16:42:01,610 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
|
| 7058 |
+
2025-03-04 16:42:04,361 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r0_c0.png
|
| 7059 |
+
2025-03-04 16:42:05,501 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c0.png
|
| 7060 |
+
2025-03-04 16:42:07,023 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c1.png
|
| 7061 |
+
2025-03-04 16:42:08,304 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
|
| 7062 |
+
2025-03-04 16:42:09,424 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
|
| 7063 |
+
2025-03-04 16:42:09,425 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
|
| 7064 |
+
2025-03-04 16:42:12,614 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
|
| 7065 |
+
2025-03-04 16:42:13,436 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
|
| 7066 |
+
2025-03-04 16:42:13,691 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c1.png
|
| 7067 |
+
2025-03-04 16:42:14,930 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c0.png
|
| 7068 |
+
2025-03-04 16:42:16,395 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c1.png
|
| 7069 |
+
2025-03-04 16:42:17,794 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
|
| 7070 |
+
2025-03-04 16:42:18,797 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
|
| 7071 |
+
2025-03-04 16:42:19,053 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c0.png
|
| 7072 |
+
2025-03-04 16:42:20,378 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c1.png
|
| 7073 |
+
2025-03-04 16:42:21,565 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
|
| 7074 |
+
2025-03-04 16:42:22,635 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
|
| 7075 |
+
2025-03-04 16:42:22,635 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
|
| 7076 |
+
2025-03-04 16:42:23,713 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r0_c0.png
|
| 7077 |
+
2025-03-04 16:42:24,787 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c0.png
|
| 7078 |
+
2025-03-04 16:42:26,077 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c1.png
|
| 7079 |
+
2025-03-04 16:42:27,195 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
|
| 7080 |
+
2025-03-04 16:42:28,273 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
|
| 7081 |
+
2025-03-04 16:42:28,274 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
|
| 7082 |
+
2025-03-04 16:42:32,234 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r0_c0.png
|
| 7083 |
+
2025-03-04 16:42:33,414 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c0.png
|
| 7084 |
+
2025-03-04 16:42:35,062 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c1.png
|
| 7085 |
+
2025-03-04 16:42:36,362 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c0.png
|
| 7086 |
+
2025-03-04 16:42:37,790 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c1.png
|
| 7087 |
+
2025-03-04 16:42:38,877 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c0.png
|
| 7088 |
+
2025-03-04 16:42:40,011 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c1.png
|
| 7089 |
+
2025-03-04 16:42:41,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
|
| 7090 |
+
2025-03-04 16:42:42,019 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
|
| 7091 |
+
2025-03-04 16:42:42,020 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
|
| 7092 |
+
2025-03-04 16:42:45,163 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r0_c0.png
|
| 7093 |
+
2025-03-04 16:42:46,253 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c0.png
|
| 7094 |
+
2025-03-04 16:42:47,665 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c1.png
|
| 7095 |
+
2025-03-04 16:42:48,812 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c0.png
|
| 7096 |
+
2025-03-04 16:42:50,033 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c1.png
|
| 7097 |
+
2025-03-04 16:42:51,432 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c0.png
|
| 7098 |
+
2025-03-04 16:42:52,858 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c1.png
|
| 7099 |
+
2025-03-04 16:42:54,216 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c0.png
|
| 7100 |
+
2025-03-04 16:42:55,778 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c1.png
|
| 7101 |
+
2025-03-04 16:42:56,931 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
|
| 7102 |
+
2025-03-04 16:42:57,851 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
|
| 7103 |
+
2025-03-04 16:42:57,851 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
|
| 7104 |
+
2025-03-04 16:43:01,201 [WARNING] __main__ - Cell image not found: /tmp/tmpdmvh3rc8.jpg_rows/row_0/col_0.png
|
| 7105 |
+
2025-03-04 16:43:01,475 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r1_c0.png
|
| 7106 |
+
2025-03-04 16:43:02,567 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c0.png
|
| 7107 |
+
2025-03-04 16:43:04,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c1.png
|
| 7108 |
+
2025-03-04 16:43:05,365 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c0.png
|
| 7109 |
+
2025-03-04 16:43:06,802 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c1.png
|
| 7110 |
+
2025-03-04 16:43:07,969 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
|
| 7111 |
+
2025-03-04 16:43:08,946 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
|
| 7112 |
+
2025-03-04 16:43:08,946 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
|
| 7113 |
+
2025-03-04 16:43:10,806 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r0_c0.png
|
| 7114 |
+
2025-03-04 16:43:11,925 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c0.png
|
| 7115 |
+
2025-03-04 16:43:13,286 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c1.png
|
| 7116 |
+
2025-03-04 16:43:14,651 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c0.png
|
| 7117 |
+
2025-03-04 16:43:16,070 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c1.png
|
| 7118 |
+
2025-03-04 16:43:17,209 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
|
| 7119 |
+
2025-03-04 16:43:18,206 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
|
| 7120 |
+
2025-03-04 16:43:18,206 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
|
| 7121 |
+
2025-03-04 16:43:21,350 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r0_c0.png
|
| 7122 |
+
2025-03-04 16:43:22,479 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c0.png
|
| 7123 |
+
2025-03-04 16:43:24,003 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c1.png
|
| 7124 |
+
2025-03-04 16:43:25,317 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c0.png
|
| 7125 |
+
2025-03-04 16:43:26,815 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c1.png
|
| 7126 |
+
2025-03-04 16:43:28,078 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
|
| 7127 |
+
2025-03-04 16:43:29,086 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
|
| 7128 |
+
2025-03-04 16:43:29,086 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
|
| 7129 |
+
2025-03-04 16:43:30,918 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r0_c0.png
|
| 7130 |
+
2025-03-04 16:43:32,141 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c0.png
|
| 7131 |
+
2025-03-04 16:43:33,282 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c1.png
|
| 7132 |
+
2025-03-04 16:43:34,592 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c0.png
|
| 7133 |
+
2025-03-04 16:43:36,080 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c1.png
|
| 7134 |
+
2025-03-04 16:43:37,530 [WARNING] __main__ - Cell image not found: /tmp/tmp6_d2lvpn.jpg_rows/row_3/col_0.png
|
| 7135 |
+
2025-03-04 16:43:37,531 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
|
| 7136 |
+
2025-03-04 16:43:40,529 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r0_c0.png
|
| 7137 |
+
2025-03-04 16:43:41,854 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c0.png
|
| 7138 |
+
2025-03-04 16:43:43,415 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c1.png
|
| 7139 |
+
2025-03-04 16:43:45,170 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
|
| 7140 |
+
2025-03-04 16:43:46,291 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
|
| 7141 |
+
2025-03-04 16:43:46,292 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
|
| 7142 |
+
2025-03-04 16:43:48,973 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r0_c0.png
|
| 7143 |
+
2025-03-04 16:43:50,109 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c0.png
|
| 7144 |
+
2025-03-04 16:43:51,618 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c1.png
|
| 7145 |
+
2025-03-04 16:43:52,724 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r1_c1.png
|
| 7146 |
+
2025-03-04 16:43:52,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
|
| 7147 |
+
2025-03-04 16:43:54,163 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
|
| 7148 |
+
2025-03-04 16:43:54,163 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
|
| 7149 |
+
2025-03-04 16:43:56,200 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r0_c0.png
|
| 7150 |
+
2025-03-04 16:43:57,589 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c0.png
|
| 7151 |
+
2025-03-04 16:43:59,010 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c1.png
|
| 7152 |
+
2025-03-04 16:44:00,106 [WARNING] __main__ - Cell image not found: /tmp/tmp5l7mn427.jpg_rows/row_2/col_0.png
|
| 7153 |
+
2025-03-04 16:44:00,107 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
|
| 7154 |
+
2025-03-04 16:44:03,906 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r0_c0.png
|
| 7155 |
+
2025-03-04 16:44:05,120 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c0.png
|
| 7156 |
+
2025-03-04 16:44:06,699 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c1.png
|
| 7157 |
+
2025-03-04 16:44:08,013 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_24.jpg_r1_c1.png
|
| 7158 |
+
2025-03-04 16:44:08,014 [WARNING] __main__ - Cell image not found: /tmp/tmp7f4e012v.jpg_rows/row_2/col_0.png
|
| 7159 |
+
2025-03-04 16:44:08,019 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
|
| 7160 |
+
2025-03-04 16:44:10,562 [WARNING] __main__ - Cell image not found: /tmp/tmpsr1107vb.jpg_rows/row_0/col_0.png
|
| 7161 |
+
2025-03-04 16:44:10,823 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r1_c0.png
|
| 7162 |
+
2025-03-04 16:44:12,067 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c0.png
|
| 7163 |
+
2025-03-04 16:44:13,630 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c1.png
|
| 7164 |
+
2025-03-04 16:44:15,001 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
|
| 7165 |
+
2025-03-04 16:44:16,162 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
|
| 7166 |
+
2025-03-04 16:44:16,163 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
|
| 7167 |
+
2025-03-04 16:44:18,257 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r0_c0.png
|
| 7168 |
+
2025-03-04 16:44:19,367 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c0.png
|
| 7169 |
+
2025-03-04 16:44:20,866 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c1.png
|
| 7170 |
+
2025-03-04 16:44:22,029 [WARNING] __main__ - Cell image not found: /tmp/tmpm4jr09co.jpg_rows/row_2/col_0.png
|
| 7171 |
+
2025-03-04 16:44:22,030 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
|
| 7172 |
+
2025-03-04 16:44:25,458 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r0_c0.png
|
| 7173 |
+
2025-03-04 16:44:26,636 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c0.png
|
| 7174 |
+
2025-03-04 16:44:28,117 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c1.png
|
| 7175 |
+
2025-03-04 16:44:29,316 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c0.png
|
| 7176 |
+
2025-03-04 16:44:30,892 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c1.png
|
| 7177 |
+
2025-03-04 16:44:32,031 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
|
| 7178 |
+
2025-03-04 16:44:32,983 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
|
| 7179 |
+
2025-03-04 16:44:32,984 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=three
|
| 7180 |
+
2025-03-04 16:44:35,702 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r0_c0.png
|
| 7181 |
+
2025-03-04 16:44:37,077 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c0.png
|
| 7182 |
+
2025-03-04 16:44:38,586 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c1.png
|
| 7183 |
+
2025-03-04 16:44:40,000 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
|
| 7184 |
+
2025-03-04 16:44:41,005 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
|
| 7185 |
+
2025-03-04 16:44:41,006 [INFO] __main__ - Processing table image: /topic-extraction/img_29.jpg, columns=three
|
| 7186 |
+
2025-03-04 16:44:42,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r0_c0.png
|
| 7187 |
+
2025-03-04 16:44:43,877 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c0.png
|
| 7188 |
+
2025-03-04 16:44:45,297 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c1.png
|
| 7189 |
+
2025-03-04 16:44:46,572 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
|
| 7190 |
+
2025-03-04 16:44:47,560 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
|
| 7191 |
+
2025-03-04 16:44:47,564 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
|
| 7192 |
+
2025-03-04 16:44:47,893 [INFO] __main__ - GPU memory cleaned up.
|
| 7193 |
+
2025-03-04 16:44:47,898 [INFO] __main__ - Processing completed successfully.
|
| 7194 |
+
2025-03-04 17:13:14,000 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf
|
| 7195 |
+
2025-03-04 17:13:14,813 [INFO] __main__ - Gemini returned subtopics: {'2.1AS units': [7, 22], '2.2A2 units': [23, 43]}
|
| 7196 |
+
2025-03-04 17:13:14,814 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf'
|
| 7197 |
+
2025-03-04 17:13:15,049 [INFO] __main__ - Computed global offset: 0
|
| 7198 |
+
2025-03-04 17:13:15,049 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
|
| 7199 |
+
2025-03-04 17:13:55,840 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
|
| 7200 |
+
2025-03-04 17:13:56,487 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
|
| 7201 |
+
2025-03-04 17:13:56,943 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
|
| 7202 |
+
2025-03-04 17:13:57,441 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
|
| 7203 |
+
2025-03-04 17:13:57,816 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
|
| 7204 |
+
2025-03-04 17:13:58,206 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
|
| 7205 |
+
2025-03-04 17:13:58,724 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
|
| 7206 |
+
2025-03-04 17:13:59,172 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
|
| 7207 |
+
2025-03-04 17:13:59,579 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
|
| 7208 |
+
2025-03-04 17:13:59,870 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
|
| 7209 |
+
2025-03-04 17:14:00,375 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
|
| 7210 |
+
2025-03-04 17:14:00,860 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
|
| 7211 |
+
2025-03-04 17:14:01,418 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
|
| 7212 |
+
2025-03-04 17:14:01,705 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
|
| 7213 |
+
2025-03-04 17:14:02,299 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
|
| 7214 |
+
2025-03-04 17:14:02,835 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
|
| 7215 |
+
2025-03-04 17:14:03,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
|
| 7216 |
+
2025-03-04 17:14:03,722 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
|
| 7217 |
+
2025-03-04 17:14:04,256 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
|
| 7218 |
+
2025-03-04 17:14:04,622 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
|
| 7219 |
+
2025-03-04 17:14:05,155 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
|
| 7220 |
+
2025-03-04 17:14:05,630 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
|
| 7221 |
+
2025-03-04 17:14:06,016 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
|
| 7222 |
+
2025-03-04 17:14:06,624 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
|
| 7223 |
+
2025-03-04 17:14:07,057 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
|
| 7224 |
+
2025-03-04 17:14:07,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
|
| 7225 |
+
2025-03-04 17:14:08,013 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
|
| 7226 |
+
2025-03-04 17:14:08,559 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
|
| 7227 |
+
2025-03-04 17:14:08,944 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
|
| 7228 |
+
2025-03-04 17:14:09,370 [INFO] __main__ - Classifying images to detect tables.
|
| 7229 |
+
2025-03-04 17:14:13,356 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
|
| 7230 |
+
2025-03-04 17:14:16,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r0_c0.png
|
| 7231 |
+
2025-03-04 17:14:17,824 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c0.png
|
| 7232 |
+
2025-03-04 17:14:19,207 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c1.png
|
| 7233 |
+
2025-03-04 17:14:20,785 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c0.png
|
| 7234 |
+
2025-03-04 17:14:22,337 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c1.png
|
| 7235 |
+
2025-03-04 17:14:24,117 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c0.png
|
| 7236 |
+
2025-03-04 17:14:25,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c1.png
|
| 7237 |
+
2025-03-04 17:14:26,797 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
|
| 7238 |
+
2025-03-04 17:14:27,715 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
|
| 7239 |
+
2025-03-04 17:14:27,715 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
|
| 7240 |
+
2025-03-04 17:14:31,016 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r0_c0.png
|
| 7241 |
+
2025-03-04 17:14:32,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c0.png
|
| 7242 |
+
2025-03-04 17:14:34,010 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c1.png
|
| 7243 |
+
2025-03-04 17:14:37,127 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c0.png
|
| 7244 |
+
2025-03-04 17:14:38,574 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c1.png
|
| 7245 |
+
2025-03-04 17:14:40,014 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c0.png
|
| 7246 |
+
2025-03-04 17:14:41,453 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c1.png
|
| 7247 |
+
2025-03-04 17:14:43,026 [WARNING] __main__ - Cell image not found: /tmp/tmpgz3m3b9n.jpg_rows/row_4/col_0.png
|
| 7248 |
+
2025-03-04 17:14:43,026 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
|
| 7249 |
+
2025-03-04 17:14:45,066 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r0_c0.png
|
| 7250 |
+
2025-03-04 17:14:46,513 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c0.png
|
| 7251 |
+
2025-03-04 17:14:48,054 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c1.png
|
| 7252 |
+
2025-03-04 17:14:49,517 [WARNING] __main__ - Cell image not found: /tmp/tmpkn9damk4.jpg_rows/row_2/col_0.png
|
| 7253 |
+
2025-03-04 17:14:49,518 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
|
| 7254 |
+
2025-03-04 17:14:51,857 [WARNING] __main__ - Cell image not found: /tmp/tmp_7v9cvwb.jpg_rows/row_0/col_0.png
|
| 7255 |
+
2025-03-04 17:14:51,857 [WARNING] __main__ - Cell image not found: /tmp/tmp_7v9cvwb.jpg_rows/row_0/col_1.png
|
| 7256 |
+
2025-03-04 17:14:52,128 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r1_c0.png
|
| 7257 |
+
2025-03-04 17:14:53,246 [WARNING] __main__ - Cell image not found: /tmp/tmp_7v9cvwb.jpg_rows/row_1/col_1.png
|
| 7258 |
+
2025-03-04 17:14:53,522 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c0.png
|
| 7259 |
+
2025-03-04 17:14:54,896 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c1.png
|
| 7260 |
+
2025-03-04 17:14:56,522 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c0.png
|
| 7261 |
+
2025-03-04 17:14:57,958 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c1.png
|
| 7262 |
+
2025-03-04 17:14:59,510 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
|
| 7263 |
+
2025-03-04 17:15:00,493 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
|
| 7264 |
+
2025-03-04 17:15:00,494 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
|
| 7265 |
+
2025-03-04 17:15:01,571 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r0_c0.png
|
| 7266 |
+
2025-03-04 17:15:02,997 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c0.png
|
| 7267 |
+
2025-03-04 17:15:04,424 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c1.png
|
| 7268 |
+
2025-03-04 17:15:05,776 [WARNING] __main__ - Cell image not found: /tmp/tmpr68pawul.jpg_rows/row_2/col_0.png
|
| 7269 |
+
2025-03-04 17:15:05,776 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
|
| 7270 |
+
2025-03-04 17:15:07,783 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r0_c0.png
|
| 7271 |
+
2025-03-04 17:15:09,073 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c0.png
|
| 7272 |
+
2025-03-04 17:15:10,711 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c1.png
|
| 7273 |
+
2025-03-04 17:15:12,507 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
|
| 7274 |
+
2025-03-04 17:15:13,630 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
|
| 7275 |
+
2025-03-04 17:15:13,631 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
|
| 7276 |
+
2025-03-04 17:15:16,878 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r0_c0.png
|
| 7277 |
+
2025-03-04 17:15:18,344 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c0.png
|
| 7278 |
+
2025-03-04 17:15:19,949 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c1.png
|
| 7279 |
+
2025-03-04 17:15:22,552 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c0.png
|
| 7280 |
+
2025-03-04 17:15:23,888 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c1.png
|
| 7281 |
+
2025-03-04 17:15:25,222 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
|
| 7282 |
+
2025-03-04 17:15:26,200 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
|
| 7283 |
+
2025-03-04 17:15:26,200 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
|
| 7284 |
+
2025-03-04 17:15:28,378 [WARNING] __main__ - Cell image not found: /tmp/tmpeauayzcm.jpg_rows/row_0/col_0.png
|
| 7285 |
+
2025-03-04 17:15:28,378 [WARNING] __main__ - Cell image not found: /tmp/tmpeauayzcm.jpg_rows/row_0/col_1.png
|
| 7286 |
+
2025-03-04 17:15:28,642 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r1_c0.png
|
| 7287 |
+
2025-03-04 17:15:30,092 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c0.png
|
| 7288 |
+
2025-03-04 17:15:31,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c1.png
|
| 7289 |
+
2025-03-04 17:15:33,367 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c0.png
|
| 7290 |
+
2025-03-04 17:15:34,783 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c1.png
|
| 7291 |
+
2025-03-04 17:15:36,384 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
|
| 7292 |
+
2025-03-04 17:15:37,395 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
|
| 7293 |
+
2025-03-04 17:15:37,396 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
|
| 7294 |
+
2025-03-04 17:15:39,469 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r0_c0.png
|
| 7295 |
+
2025-03-04 17:15:40,865 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c0.png
|
| 7296 |
+
2025-03-04 17:15:42,177 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c1.png
|
| 7297 |
+
2025-03-04 17:15:43,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c0.png
|
| 7298 |
+
2025-03-04 17:15:45,111 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c1.png
|
| 7299 |
+
2025-03-04 17:15:47,334 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c0.png
|
| 7300 |
+
2025-03-04 17:15:48,513 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c1.png
|
| 7301 |
+
2025-03-04 17:15:49,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
|
| 7302 |
+
2025-03-04 17:15:50,582 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
|
| 7303 |
+
2025-03-04 17:15:50,582 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
|
| 7304 |
+
2025-03-04 17:15:51,570 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r0_c0.png
|
| 7305 |
+
2025-03-04 17:15:53,041 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c0.png
|
| 7306 |
+
2025-03-04 17:15:54,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c1.png
|
| 7307 |
+
2025-03-04 17:15:55,844 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
|
| 7308 |
+
2025-03-04 17:15:56,966 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
|
| 7309 |
+
2025-03-04 17:15:56,967 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
|
| 7310 |
+
2025-03-04 17:15:59,374 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r0_c0.png
|
| 7311 |
+
2025-03-04 17:16:00,804 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c0.png
|
| 7312 |
+
2025-03-04 17:16:02,580 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c1.png
|
| 7313 |
+
2025-03-04 17:16:04,173 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
|
| 7314 |
+
2025-03-04 17:16:05,062 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
|
| 7315 |
+
2025-03-04 17:16:05,062 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
|
| 7316 |
+
2025-03-04 17:16:07,653 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r0_c0.png
|
| 7317 |
+
2025-03-04 17:16:09,201 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c0.png
|
| 7318 |
+
2025-03-04 17:16:10,928 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c1.png
|
| 7319 |
+
2025-03-04 17:16:12,739 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
|
| 7320 |
+
2025-03-04 17:16:13,735 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
|
| 7321 |
+
2025-03-04 17:16:13,735 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
|
| 7322 |
+
2025-03-04 17:16:16,756 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
|
| 7323 |
+
2025-03-04 17:16:17,689 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
|
| 7324 |
+
2025-03-04 17:16:17,947 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c1.png
|
| 7325 |
+
2025-03-04 17:16:19,521 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c0.png
|
| 7326 |
+
2025-03-04 17:16:21,310 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c1.png
|
| 7327 |
+
2025-03-04 17:16:23,370 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
|
| 7328 |
+
2025-03-04 17:16:24,380 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
|
| 7329 |
+
2025-03-04 17:16:24,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c0.png
|
| 7330 |
+
2025-03-04 17:16:26,009 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c1.png
|
| 7331 |
+
2025-03-04 17:16:27,859 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
|
| 7332 |
+
2025-03-04 17:16:28,943 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
|
| 7333 |
+
2025-03-04 17:16:28,943 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
|
| 7334 |
+
2025-03-04 17:16:30,062 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r0_c0.png
|
| 7335 |
+
2025-03-04 17:16:31,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c0.png
|
| 7336 |
+
2025-03-04 17:16:32,831 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c1.png
|
| 7337 |
+
2025-03-04 17:16:34,357 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
|
| 7338 |
+
2025-03-04 17:16:35,420 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
|
| 7339 |
+
2025-03-04 17:16:35,420 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
|
| 7340 |
+
2025-03-04 17:16:38,997 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r0_c0.png
|
| 7341 |
+
2025-03-04 17:16:40,384 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c0.png
|
| 7342 |
+
2025-03-04 17:16:42,086 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c1.png
|
| 7343 |
+
2025-03-04 17:16:43,960 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c0.png
|
| 7344 |
+
2025-03-04 17:16:45,362 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c1.png
|
| 7345 |
+
2025-03-04 17:16:47,152 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c0.png
|
| 7346 |
+
2025-03-04 17:16:48,540 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c1.png
|
| 7347 |
+
2025-03-04 17:16:49,983 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
|
| 7348 |
+
2025-03-04 17:16:51,054 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
|
| 7349 |
+
2025-03-04 17:16:51,054 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
|
| 7350 |
+
2025-03-04 17:16:54,130 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r0_c0.png
|
| 7351 |
+
2025-03-04 17:16:55,482 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c0.png
|
| 7352 |
+
2025-03-04 17:16:56,770 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c1.png
|
| 7353 |
+
2025-03-04 17:16:58,649 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c0.png
|
| 7354 |
+
2025-03-04 17:16:59,944 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c1.png
|
| 7355 |
+
2025-03-04 17:17:01,397 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c0.png
|
| 7356 |
+
2025-03-04 17:17:02,716 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c1.png
|
| 7357 |
+
2025-03-04 17:17:04,306 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c0.png
|
| 7358 |
+
2025-03-04 17:17:05,735 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c1.png
|
| 7359 |
+
2025-03-04 17:17:07,796 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
|
| 7360 |
+
2025-03-04 17:17:08,712 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
|
| 7361 |
+
2025-03-04 17:17:08,712 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
|
| 7362 |
+
2025-03-04 17:17:11,791 [WARNING] __main__ - Cell image not found: /tmp/tmp8t199g9l.jpg_rows/row_0/col_0.png
|
| 7363 |
+
2025-03-04 17:17:12,053 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r1_c0.png
|
| 7364 |
+
2025-03-04 17:17:13,388 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c0.png
|
| 7365 |
+
2025-03-04 17:17:14,808 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c1.png
|
| 7366 |
+
2025-03-04 17:17:16,828 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c0.png
|
| 7367 |
+
2025-03-04 17:17:18,305 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c1.png
|
| 7368 |
+
2025-03-04 17:17:20,126 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
|
| 7369 |
+
2025-03-04 17:17:21,082 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
|
| 7370 |
+
2025-03-04 17:17:21,082 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
|
| 7371 |
+
2025-03-04 17:17:22,864 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r0_c0.png
|
| 7372 |
+
2025-03-04 17:17:24,349 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c0.png
|
| 7373 |
+
2025-03-04 17:17:25,674 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c1.png
|
| 7374 |
+
2025-03-04 17:17:27,576 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c0.png
|
| 7375 |
+
2025-03-04 17:17:28,875 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c1.png
|
| 7376 |
+
2025-03-04 17:17:30,338 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
|
| 7377 |
+
2025-03-04 17:17:31,459 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
|
| 7378 |
+
2025-03-04 17:17:31,460 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
|
| 7379 |
+
2025-03-04 17:17:33,895 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r0_c0.png
|
| 7380 |
+
2025-03-04 17:17:35,505 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c0.png
|
| 7381 |
+
2025-03-04 17:17:36,920 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c1.png
|
| 7382 |
+
2025-03-04 17:17:38,707 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c0.png
|
| 7383 |
+
2025-03-04 17:17:40,159 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c1.png
|
| 7384 |
+
2025-03-04 17:17:42,150 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
|
| 7385 |
+
2025-03-04 17:17:43,069 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
|
| 7386 |
+
2025-03-04 17:17:43,069 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
|
| 7387 |
+
2025-03-04 17:17:44,770 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r0_c0.png
|
| 7388 |
+
2025-03-04 17:17:46,112 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c0.png
|
| 7389 |
+
2025-03-04 17:17:47,369 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c1.png
|
| 7390 |
+
2025-03-04 17:17:48,764 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c0.png
|
| 7391 |
+
2025-03-04 17:17:50,279 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c1.png
|
| 7392 |
+
2025-03-04 17:17:52,008 [WARNING] __main__ - Cell image not found: /tmp/tmpyuhd9sl8.jpg_rows/row_3/col_0.png
|
| 7393 |
+
2025-03-04 17:17:52,009 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
|
| 7394 |
+
2025-03-04 17:17:54,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r0_c0.png
|
| 7395 |
+
2025-03-04 17:17:56,238 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c0.png
|
| 7396 |
+
2025-03-04 17:17:58,121 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c1.png
|
| 7397 |
+
2025-03-04 17:18:00,408 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
|
| 7398 |
+
2025-03-04 17:18:01,418 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
|
| 7399 |
+
2025-03-04 17:18:01,418 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
|
| 7400 |
+
2025-03-04 17:18:03,917 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r0_c0.png
|
| 7401 |
+
2025-03-04 17:18:05,292 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c0.png
|
| 7402 |
+
2025-03-04 17:18:07,082 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c1.png
|
| 7403 |
+
2025-03-04 17:18:08,934 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
|
| 7404 |
+
2025-03-04 17:18:10,012 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
|
| 7405 |
+
2025-03-04 17:18:10,012 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
|
| 7406 |
+
2025-03-04 17:18:11,952 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r0_c0.png
|
| 7407 |
+
2025-03-04 17:18:13,275 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c0.png
|
| 7408 |
+
2025-03-04 17:18:14,714 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c1.png
|
| 7409 |
+
2025-03-04 17:18:16,140 [WARNING] __main__ - Cell image not found: /tmp/tmp91opcy4g.jpg_rows/row_2/col_0.png
|
| 7410 |
+
2025-03-04 17:18:16,140 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
|
| 7411 |
+
2025-03-04 17:18:19,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r0_c0.png
|
| 7412 |
+
2025-03-04 17:18:21,092 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c0.png
|
| 7413 |
+
2025-03-04 17:18:23,324 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c1.png
|
| 7414 |
+
2025-03-04 17:18:26,880 [WARNING] __main__ - Cell image not found: /tmp/tmpigalpv91.jpg_rows/row_2/col_0.png
|
| 7415 |
+
2025-03-04 17:18:26,880 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
|
| 7416 |
+
2025-03-04 17:18:29,208 [WARNING] __main__ - Cell image not found: /tmp/tmppaoedyal.jpg_rows/row_0/col_0.png
|
| 7417 |
+
2025-03-04 17:18:29,475 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r1_c0.png
|
| 7418 |
+
2025-03-04 17:18:30,947 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c0.png
|
| 7419 |
+
2025-03-04 17:18:33,064 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c1.png
|
| 7420 |
+
2025-03-04 17:18:36,316 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
|
| 7421 |
+
2025-03-04 17:18:37,482 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
|
| 7422 |
+
2025-03-04 17:18:37,483 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
|
| 7423 |
+
2025-03-04 17:18:39,543 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r0_c0.png
|
| 7424 |
+
2025-03-04 17:18:40,901 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c0.png
|
| 7425 |
+
2025-03-04 17:18:42,749 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c1.png
|
| 7426 |
+
2025-03-04 17:18:44,332 [WARNING] __main__ - Cell image not found: /tmp/tmp22n5c_8q.jpg_rows/row_2/col_0.png
|
| 7427 |
+
2025-03-04 17:18:44,332 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
|
| 7428 |
+
2025-03-04 17:18:47,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r0_c0.png
|
| 7429 |
+
2025-03-04 17:18:49,048 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c0.png
|
| 7430 |
+
2025-03-04 17:18:50,572 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c1.png
|
| 7431 |
+
2025-03-04 17:18:52,196 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c0.png
|
| 7432 |
+
2025-03-04 17:18:53,636 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c1.png
|
| 7433 |
+
2025-03-04 17:18:55,054 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
|
| 7434 |
+
2025-03-04 17:18:56,002 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
|
| 7435 |
+
2025-03-04 17:18:56,003 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=three
|
| 7436 |
+
2025-03-04 17:18:58,520 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r0_c0.png
|
| 7437 |
+
2025-03-04 17:18:59,970 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c0.png
|
| 7438 |
+
2025-03-04 17:19:01,773 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c1.png
|
| 7439 |
+
2025-03-04 17:19:03,587 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
|
| 7440 |
+
2025-03-04 17:19:04,755 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
|
| 7441 |
+
2025-03-04 17:19:04,755 [INFO] __main__ - Processing table image: /topic-extraction/img_29.jpg, columns=three
|
| 7442 |
+
2025-03-04 17:19:06,526 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r0_c0.png
|
| 7443 |
+
2025-03-04 17:19:07,817 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c0.png
|
| 7444 |
+
2025-03-04 17:19:09,284 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c1.png
|
| 7445 |
+
2025-03-04 17:19:10,915 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
|
| 7446 |
+
2025-03-04 17:19:11,969 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
|
| 7447 |
+
2025-03-04 17:19:11,972 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
|
| 7448 |
+
2025-03-04 17:19:12,278 [INFO] __main__ - GPU memory cleaned up.
|
| 7449 |
+
2025-03-04 17:19:12,283 [INFO] __main__ - Processing completed successfully.
|
| 7450 |
+
2025-03-04 17:28:37,803 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf
|
| 7451 |
+
2025-03-04 17:28:38,622 [INFO] __main__ - Gemini returned subtopics: {'2.1AS units': [7, 22], '2.2A2 units': [23, 43]}
|
| 7452 |
+
2025-03-04 17:28:38,624 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf'
|
| 7453 |
+
2025-03-04 17:28:38,859 [INFO] __main__ - Computed global offset: 0
|
| 7454 |
+
2025-03-04 17:28:38,860 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
|
| 7455 |
+
2025-03-04 17:29:19,633 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
|
| 7456 |
+
2025-03-04 17:29:20,237 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
|
| 7457 |
+
2025-03-04 17:29:20,620 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
|
| 7458 |
+
2025-03-04 17:29:21,124 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
|
| 7459 |
+
2025-03-04 17:29:21,413 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
|
| 7460 |
+
2025-03-04 17:29:21,792 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
|
| 7461 |
+
2025-03-04 17:29:22,350 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
|
| 7462 |
+
2025-03-04 17:29:22,827 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
|
| 7463 |
+
2025-03-04 17:29:23,260 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
|
| 7464 |
+
2025-03-04 17:29:23,574 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
|
| 7465 |
+
2025-03-04 17:29:24,083 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
|
| 7466 |
+
2025-03-04 17:29:24,602 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
|
| 7467 |
+
2025-03-04 17:29:25,141 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
|
| 7468 |
+
2025-03-04 17:29:25,442 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
|
| 7469 |
+
2025-03-04 17:29:26,082 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
|
| 7470 |
+
2025-03-04 17:29:26,668 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
|
| 7471 |
+
2025-03-04 17:29:27,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
|
| 7472 |
+
2025-03-04 17:29:27,575 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
|
| 7473 |
+
2025-03-04 17:29:28,110 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
|
| 7474 |
+
2025-03-04 17:29:28,509 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
|
| 7475 |
+
2025-03-04 17:29:29,046 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
|
| 7476 |
+
2025-03-04 17:29:29,553 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
|
| 7477 |
+
2025-03-04 17:29:29,936 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
|
| 7478 |
+
2025-03-04 17:29:30,523 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
|
| 7479 |
+
2025-03-04 17:29:31,034 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
|
| 7480 |
+
2025-03-04 17:29:31,417 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
|
| 7481 |
+
2025-03-04 17:29:31,991 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
|
| 7482 |
+
2025-03-04 17:29:32,506 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
|
| 7483 |
+
2025-03-04 17:29:32,884 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
|
| 7484 |
+
2025-03-04 17:29:33,308 [INFO] __main__ - Classifying images to detect tables.
|
| 7485 |
+
2025-03-04 17:59:52,883 [INFO] __main__ - GPU memory cleaned up.
|
topic_extraction.py
CHANGED
|
@@ -299,95 +299,113 @@ def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: st
|
|
| 299 |
for attempt in range(max_retries + 1):
|
| 300 |
try:
|
| 301 |
prompt = """
|
| 302 |
-
You are given an image from an educational curriculum specification. The image may contain:
|
| 303 |
1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
|
| 304 |
-
2) A subtopic heading in the format "<number>.<number>", for example "2.5", "2.6",
|
| 305 |
3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
|
| 306 |
-
4) Possibly no relevant text
|
| 307 |
|
| 308 |
Your task is to extract:
|
| 309 |
- **"title"**: A recognized main topic or heading text.
|
| 310 |
-
- **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4"), as an array of strings.
|
| 311 |
|
| 312 |
Follow these rules:
|
| 313 |
|
| 314 |
-
(1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued"
|
| 315 |
-
-
|
| 316 |
-
-
|
|
|
|
| 317 |
|
| 318 |
-
(2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4"
|
| 319 |
- Collect those exact strings in the JSON key "subtopics" (an array of strings).
|
| 320 |
-
- "title" in this case should be an empty string if you only detect subtopics.
|
| 321 |
(Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
|
| 322 |
|
| 323 |
-
(3) If no main topic or subtopic is detected but the text appears to be a heading
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
|
|
|
| 328 |
|
| 329 |
(4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
|
| 330 |
-
- Use
|
| 331 |
- "subtopics" remains empty.
|
| 332 |
-
Example:
|
| 333 |
If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
|
| 334 |
{
|
| 335 |
"title": "Scarcity, choice and opportunity cost",
|
| 336 |
"subtopics": []
|
| 337 |
}
|
| 338 |
|
| 339 |
-
(5) **If there is
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
- Put that label text into "title" (e.g. "G2").
|
| 341 |
- "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
|
| 342 |
|
| 343 |
-
(
|
| 344 |
{
|
| 345 |
"title": "...",
|
| 346 |
"subtopics": [...]
|
| 347 |
}
|
| 348 |
|
| 349 |
-
(
|
| 350 |
-
- Contains no words at all (e.g. a blank white or black image)
|
| 351 |
-
- Contains only
|
| 352 |
-
- Contains
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
|
| 358 |
**Examples**:
|
| 359 |
|
| 360 |
-
- If the image text is
|
| 361 |
{
|
| 362 |
"title": "2 Algebra and functions",
|
| 363 |
"subtopics": []
|
| 364 |
}
|
| 365 |
|
| 366 |
-
- If the image text is
|
| 367 |
{
|
| 368 |
"title": "",
|
| 369 |
"subtopics": ["2.5"]
|
| 370 |
}
|
| 371 |
|
| 372 |
-
- If the image text is
|
| 373 |
{
|
| 374 |
-
"title": "
|
| 375 |
"subtopics": []
|
| 376 |
}
|
| 377 |
|
| 378 |
-
- If the left column says
|
| 379 |
{
|
| 380 |
"title": "G2",
|
| 381 |
"subtopics": []
|
| 382 |
}
|
| 383 |
|
| 384 |
-
- If
|
| 385 |
{
|
| 386 |
-
"title": "",
|
| 387 |
"subtopics": []
|
| 388 |
}
|
| 389 |
"""
|
| 390 |
-
|
| 391 |
global _GEMINI_CLIENT
|
| 392 |
if _GEMINI_CLIENT is None:
|
| 393 |
_GEMINI_CLIENT = genai.Client(api_key=api_key)
|
|
@@ -482,7 +500,6 @@ class S3ImageWriter(DataWriter):
|
|
| 482 |
elif cls == "THREE_COLUMN":
|
| 483 |
info['final_alt'] = "HAS TO BE PROCESSED - three column table"
|
| 484 |
elif cls == "EMPTY_IMAGE":
|
| 485 |
-
# Remove markdown reference, delete from descriptions and S3.
|
| 486 |
md_content = md_content.replace(f"", "")
|
| 487 |
try:
|
| 488 |
self.s3_writer.delete(info['s3_path'])
|
|
@@ -865,7 +882,6 @@ class MineruNoTextProcessor:
|
|
| 865 |
def process(self, pdf_path: str) -> Dict[str, Any]:
|
| 866 |
logger.info(f"Processing PDF: {pdf_path}")
|
| 867 |
try:
|
| 868 |
-
# Possibly call subtopic_extractor on first pages to find subtopics in the PDF as a whole
|
| 869 |
subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
|
| 870 |
logger.info(f"Gemini returned subtopics: {subtopics}")
|
| 871 |
|
|
|
|
| 299 |
for attempt in range(max_retries + 1):
|
| 300 |
try:
|
| 301 |
prompt = """
|
| 302 |
+
You are given an image from an educational curriculum specification for Gemini Flash 2. The image may contain:
|
| 303 |
1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
|
| 304 |
+
2) A subtopic heading in the format "<number>.<number>" or "<number>.<number>.<number>", for example "2.5", "2.6", "3.4", "2.1.1", "4.3.3" or "1.2.1".
|
| 305 |
3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
|
| 306 |
+
4) Possibly no relevant text or only truncated text (e.g. "Topics", "Subject content", "What students need to learn", "Content Amplification Additional guidance notes", etc.).
|
| 307 |
|
| 308 |
Your task is to extract:
|
| 309 |
- **"title"**: A recognized main topic or heading text.
|
| 310 |
+
- **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4", "2.1.1", "4.1.1"), as an array of strings.
|
| 311 |
|
| 312 |
Follow these rules:
|
| 313 |
|
| 314 |
+
(1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued":
|
| 315 |
+
- Remove the word "continued" if present.
|
| 316 |
+
- Put that resulting text in "title". (e.g. "2 Algebra and functions")
|
| 317 |
+
- "subtopics" should be an empty array, unless smaller subtopic numbers (e.g. "2.5") are also detected in the same text.
|
| 318 |
|
| 319 |
+
(2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>" or "<number>.<number>.<number>", for example "2.5", "2.6", "3.4" or "2.1.1":
|
| 320 |
- Collect those exact strings in the JSON key "subtopics" (an array of strings).
|
| 321 |
+
- "title" in this case should be an empty string if you only detect subtopics.
|
| 322 |
(Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
|
| 323 |
|
| 324 |
+
(3) **If no main topic or subtopic is detected but the text appears to be a heading**, for example "Specialisation, division of labour and exchange", then:
|
| 325 |
+
- Return:
|
| 326 |
+
{
|
| 327 |
+
"title": "<the heading text>",
|
| 328 |
+
"subtopics": []
|
| 329 |
+
}
|
| 330 |
|
| 331 |
(4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
|
| 332 |
+
- Use that left column text as "title".
|
| 333 |
- "subtopics" remains empty.
|
| 334 |
+
Example:
|
| 335 |
If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
|
| 336 |
{
|
| 337 |
"title": "Scarcity, choice and opportunity cost",
|
| 338 |
"subtopics": []
|
| 339 |
}
|
| 340 |
|
| 341 |
+
(5) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) or it appears to be a standalone column with text, treat it as a heading.
|
| 342 |
+
- "subtopics" remains empty.
|
| 343 |
+
Example:
|
| 344 |
+
If there is only one column image that is "Specialisation, division of labour and exchange" and the right column is not present, your output is:
|
| 345 |
+
{
|
| 346 |
+
"title": "Specialisation, division of labour and exchange",
|
| 347 |
+
"subtopics": []
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
(6) **If there is a character + digit pattern** in the left column of a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
|
| 351 |
- Put that label text into "title" (e.g. "G2").
|
| 352 |
- "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
|
| 353 |
|
| 354 |
+
(7) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
|
| 355 |
{
|
| 356 |
"title": "...",
|
| 357 |
"subtopics": [...]
|
| 358 |
}
|
| 359 |
|
| 360 |
+
(8) **If the image is blank or truncated**, defined as:
|
| 361 |
+
- Contains no words at all (e.g. a blank white or black image), **OR**
|
| 362 |
+
- Contains only snippet words/phrases such as "Topics", "Subject content", "Content Amplification Additional guidance notes", "What students need to learn" (including variations in background color), **OR**
|
| 363 |
+
- Contains partial headings with no recognizable numeric or textual headings, **OR**
|
| 364 |
+
- Contains partial UI labels only, such as “Topics” in a gray bar or “What students need to learn” in a blue bar, with no additional meaningful text.
|
| 365 |
+
then return:
|
| 366 |
+
{
|
| 367 |
+
"title": "EMPTY_IMAGE",
|
| 368 |
+
"subtopics": []
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
(9) **If you cannot recognize any text matching the patterns above**, or the text is too partial/truncated to form a valid heading, also return:
|
| 372 |
+
{
|
| 373 |
+
"title": "EMPTY_IMAGE",
|
| 374 |
+
"subtopics": []
|
| 375 |
+
}
|
| 376 |
|
| 377 |
**Examples**:
|
| 378 |
|
| 379 |
+
- If the image text is "2 Algebra and functions continued", return:
|
| 380 |
{
|
| 381 |
"title": "2 Algebra and functions",
|
| 382 |
"subtopics": []
|
| 383 |
}
|
| 384 |
|
| 385 |
+
- If the image text is "2.5 Solve linear and quadratic inequalities ...", return:
|
| 386 |
{
|
| 387 |
"title": "",
|
| 388 |
"subtopics": ["2.5"]
|
| 389 |
}
|
| 390 |
|
| 391 |
+
- If the image text is "Specialisation, division of labour and exchange" (with no numeric patterns at all), return:
|
| 392 |
{
|
| 393 |
+
"title": "Specialisation, division of labour and exchange",
|
| 394 |
"subtopics": []
|
| 395 |
}
|
| 396 |
|
| 397 |
+
- If the left column says "G2" and the right column has details, but no subtopic numbers, return:
|
| 398 |
{
|
| 399 |
"title": "G2",
|
| 400 |
"subtopics": []
|
| 401 |
}
|
| 402 |
|
| 403 |
+
- If the image is blank or shows only partial/truncated snippet words (e.g. "Topics", "Content Amplification Additional guidance notes", "Subject content", "What students need to learn") and nothing else, return:
|
| 404 |
{
|
| 405 |
+
"title": "EMPTY_IMAGE",
|
| 406 |
"subtopics": []
|
| 407 |
}
|
| 408 |
"""
|
|
|
|
| 409 |
global _GEMINI_CLIENT
|
| 410 |
if _GEMINI_CLIENT is None:
|
| 411 |
_GEMINI_CLIENT = genai.Client(api_key=api_key)
|
|
|
|
| 500 |
elif cls == "THREE_COLUMN":
|
| 501 |
info['final_alt'] = "HAS TO BE PROCESSED - three column table"
|
| 502 |
elif cls == "EMPTY_IMAGE":
|
|
|
|
| 503 |
md_content = md_content.replace(f"", "")
|
| 504 |
try:
|
| 505 |
self.s3_writer.delete(info['s3_path'])
|
|
|
|
| 882 |
def process(self, pdf_path: str) -> Dict[str, Any]:
|
| 883 |
logger.info(f"Processing PDF: {pdf_path}")
|
| 884 |
try:
|
|
|
|
| 885 |
subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
|
| 886 |
logger.info(f"Gemini returned subtopics: {subtopics}")
|
| 887 |
|
worker.py
CHANGED
|
@@ -139,6 +139,7 @@ class RabbitMQWorker:
|
|
| 139 |
try:
|
| 140 |
pdf_url = file.get("url")
|
| 141 |
logger.info("[Worker %s] Processing topic extraction for URL: %s", thread_id, pdf_url)
|
|
|
|
| 142 |
result = self.topic_processor.process(pdf_url)
|
| 143 |
context = {
|
| 144 |
"key": file.get("key", ""),
|
|
|
|
| 139 |
try:
|
| 140 |
pdf_url = file.get("url")
|
| 141 |
logger.info("[Worker %s] Processing topic extraction for URL: %s", thread_id, pdf_url)
|
| 142 |
+
|
| 143 |
result = self.topic_processor.process(pdf_url)
|
| 144 |
context = {
|
| 145 |
"key": file.get("key", ""),
|