# Natural Language Toolkit: Punkt sentence tokenizer
#
# Copyright (C) 2001-2023 NLTK Project
# Algorithm: Kiss & Strunk (2006)
# Author: Willy <willy@csse.unimelb.edu.au> (original Python port)
#         Steven Bird <stevenbird1@gmail.com> (additions)
#         Edward Loper <edloper@gmail.com> (rewrite)
#         Joel Nothman <jnothman@student.usyd.edu.au> (almost rewrite)
#         Arthur Darcet <arthur@darcet.fr> (fixes)
#         Tom Aarsen <> (tackle ReDoS & performance issues)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

r"""
Punkt Sentence Tokenizer

This tokenizer divides a text into a list of sentences
by using an unsupervised algorithm to build a model for abbreviation
words, collocations, and words that start sentences.  It must be
trained on a large collection of plaintext in the target language
before it can be used.

The NLTK data package includes a pre-trained Punkt tokenizer for
English.

    >>> import nltk.data
    >>> text = '''
    ... Punkt knows that the periods in Mr. Smith and Johann S. Bach
    ... do not mark sentence boundaries.  And sometimes sentences
    ... can start with non-capitalized words.  i is a good variable
    ... name.
    ... '''
    >>> sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    >>> print('\n-----\n'.join(sent_detector.tokenize(text.strip())))
    Punkt knows that the periods in Mr. Smith and Johann S. Bach
    do not mark sentence boundaries.
    -----
    And sometimes sentences
    can start with non-capitalized words.
    -----
    i is a good variable
    name.

(Note that whitespace from the original text, including newlines, is
retained in the output.)

Punctuation following sentences is also included by default
(from NLTK 3.0 onwards). It can be excluded with the realign_boundaries
flag.

    >>> text = '''
    ... (How does it deal with this parenthesis?)  "It should be part of the
    ... previous sentence." "(And the same with this one.)" ('And this one!')
    ... "('(And (this)) '?)" [(and this. )]
    ... '''
    >>> print('\n-----\n'.join(
    ...     sent_detector.tokenize(text.strip())))
    (How does it deal with this parenthesis?)
    -----
    "It should be part of the
    previous sentence."
    -----
    "(And the same with this one.)"
    -----
    ('And this one!')
    -----
    "('(And (this)) '?)"
    -----
    [(and this. )]
    >>> print('\n-----\n'.join(
    ...     sent_detector.tokenize(text.strip(), realign_boundaries=False)))
    (How does it deal with this parenthesis?
    -----
    )  "It should be part of the
    previous sentence.
    -----
    " "(And the same with this one.
    -----
    )" ('And this one!
    -----
    ')
    "('(And (this)) '?
    -----
    )" [(and this.
    -----
    )]

However, Punkt is designed to learn parameters (a list of abbreviations, etc.)
unsupervised from a corpus similar to the target domain. The pre-packaged models
may therefore be unsuitable: use ``PunktSentenceTokenizer(text)`` to learn
parameters from the given text.

:class:`.PunktTrainer` learns parameters such as a list of abbreviations
(without supervision) from portions of text. Using a ``PunktTrainer`` directly
allows for incremental training and modification of the hyper-parameters used
to decide what is considered an abbreviation, etc.

The algorithm for this tokenizer is described in::

  Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
  Boundary Detection.  Computational Linguistics 32: 485-525.
"""

# TODO: Make orthographic heuristic less susceptible to overtraining
# TODO: Frequent sentence starters optionally exclude always-capitalised words
# FIXME: Problem with ending string with e.g. '!!!' -> '!! !'

import math
import re
import string
from collections import defaultdict
from typing import Any, Dict, Iterator, List, Match, Optional, Tuple, Union

from nltk.probability import FreqDist
from nltk.tokenize.api import TokenizerI

######################################################################
# { Orthographic Context Constants
######################################################################
# The following constants are used to describe the orthographic
# contexts in which a word can occur.  BEG=beginning, MID=middle,
# UNK=unknown, UC=uppercase, LC=lowercase, NC=no case.

_ORTHO_BEG_UC = 1 << 1
"""Orthographic context: beginning of a sentence with upper case."""

_ORTHO_MID_UC = 1 << 2
"""Orthographic context: middle of a sentence with upper case."""

_ORTHO_UNK_UC = 1 << 3
"""Orthographic context: unknown position in a sentence with upper case."""

_ORTHO_BEG_LC = 1 << 4
"""Orthographic context: beginning of a sentence with lower case."""

_ORTHO_MID_LC = 1 << 5
"""Orthographic context: middle of a sentence with lower case."""

_ORTHO_UNK_LC = 1 << 6
"""Orthographic context: unknown position in a sentence with lower case."""

_ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC
"""Orthographic context: occurs with upper case."""

_ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC
"""Orthographic context: occurs with lower case."""

_ORTHO_MAP = {
    ("initial", "upper"): _ORTHO_BEG_UC,
    ("internal", "upper"): _ORTHO_MID_UC,
    ("unknown", "upper"): _ORTHO_UNK_UC,
    ("initial", "lower"): _ORTHO_BEG_LC,
    ("internal", "lower"): _ORTHO_MID_LC,
    ("unknown", "lower"): _ORTHO_UNK_LC,
}
"""A map from context position and first-letter case to the
appropriate orthographic context flag."""

# } (end orthographic context constants)
######################################################################

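# Illustrative sketch (not part of the original module): context evidence
# accumulates by bitwise OR and is queried by bitwise AND, e.g. for a type
# seen capitalised at a sentence start and lowercased mid-sentence:
#
#     >>> ctx = _ORTHO_BEG_UC | _ORTHO_MID_LC
#     >>> bool(ctx & _ORTHO_UC), bool(ctx & _ORTHO_LC), bool(ctx & _ORTHO_MID_UC)
#     (True, True, False)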
######################################################################
# { Decision reasons for debugging
######################################################################

REASON_DEFAULT_DECISION = "default decision"
REASON_KNOWN_COLLOCATION = "known collocation (both words)"
REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = "abbreviation + orthographic heuristic"
REASON_ABBR_WITH_SENTENCE_STARTER = "abbreviation + frequent sentence starter"
REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic"
REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic"
REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC = (
    "initial + special orthographic heuristic"
)


# } (end decision reasons for debugging)
######################################################################

######################################################################
# { Language-dependent variables
######################################################################


class PunktLanguageVars:
    """
    Stores variables, mostly regular expressions, which may be
    language-dependent for correct application of the algorithm.
    An extension of this class may modify its properties to suit
    a language other than English; an instance can then be passed
    as an argument to PunktSentenceTokenizer and PunktTrainer
    constructors.
    """

    __slots__ = ("_re_period_context", "_re_word_tokenizer")

    def __getstate__(self):
        # All modifications to the class are performed by inheritance.
        # Non-default parameters to be pickled must be defined in the inherited
        # class.
        return 1

    def __setstate__(self, state):
        return 1

    sent_end_chars = (".", "?", "!")
    """Characters which are candidates for sentence boundaries"""

    @property
    def _re_sent_end_chars(self):
        return "[%s]" % re.escape("".join(self.sent_end_chars))

    internal_punctuation = ",:;"  # might want to extend this..
    """sentence internal punctuation, which indicates an abbreviation if
    preceded by a period-final token."""

    re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)', re.MULTILINE)
    """Used to realign punctuation that should be included in a sentence
    although it follows the period (or ?, !)."""

    _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]"
    """Excludes some characters from starting word tokens"""

    @property
    def _re_non_word_chars(self):
        return r"(?:[)\";}\]\*:@\'\({\[%s])" % re.escape(
            "".join(set(self.sent_end_chars) - {"."})
        )

    """Characters that cannot appear within words"""

    _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)"
    """Hyphen and ellipsis are multi-character punctuation"""

    _word_tokenize_fmt = r"""(
        %(MultiChar)s
        |
        (?=%(WordStart)s)\S+?  # Accept word characters until end is found
        (?= # Sequences marking a word's end
            \s|                                 # White-space
            $|                                  # End-of-string
            %(NonWord)s|%(MultiChar)s|          # Punctuation
            ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word
        )
        |
        \S
    )"""
    """Format of a regular expression to split punctuation from words,
    excluding period."""

    def _word_tokenizer_re(self):
        """Compiles and returns a regular expression for word tokenization"""
        try:
            return self._re_word_tokenizer
        except AttributeError:
            self._re_word_tokenizer = re.compile(
                self._word_tokenize_fmt
                % {
                    "NonWord": self._re_non_word_chars,
                    "MultiChar": self._re_multi_char_punct,
                    "WordStart": self._re_word_start,
                },
                re.UNICODE | re.VERBOSE,
            )
            return self._re_word_tokenizer

    def word_tokenize(self, s):
        """Tokenize a string to split off punctuation other than periods"""
        return self._word_tokenizer_re().findall(s)

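    # Illustrative sketch (not part of the original module): with the default
    # English settings, periods stay attached to words while most other
    # punctuation is split off.
    #
    #     >>> PunktLanguageVars().word_tokenize("Hello, world. (Bye!)")
    #     ['Hello', ',', 'world.', '(', 'Bye', '!', ')']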
    _period_context_fmt = r"""
        %(SentEndChars)s             # a potential sentence ending
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
            |
            \s+(?P<next_tok>\S+)     # or whitespace and some other token
        ))"""
    """Format of a regular expression to find contexts including possible
    sentence boundaries. Matches the token in which the possible sentence
    boundary ends, and matches the following token within a lookahead
    expression."""

    def period_context_re(self):
        """Compiles and returns a regular expression to find contexts
        including possible sentence boundaries."""
        try:
            return self._re_period_context
        except AttributeError:
            self._re_period_context = re.compile(
                self._period_context_fmt
                % {
                    "NonWord": self._re_non_word_chars,
                    "SentEndChars": self._re_sent_end_chars,
                },
                re.UNICODE | re.VERBOSE,
            )
            return self._re_period_context


_re_non_punct = re.compile(r"[^\W\d]", re.UNICODE)
"""Matches token types that are not merely punctuation. (Types for
numeric tokens are changed to ##number## and hence contain alpha.)"""


# }
######################################################################


# ////////////////////////////////////////////////////////////
# { Helper Functions
# ////////////////////////////////////////////////////////////


def _pair_iter(iterator):
    """
    Yields pairs of tokens from the given iterator such that each input
    token will appear as the first element in a yielded tuple. The last
    pair will have None as its second element.
    """
    iterator = iter(iterator)
    try:
        prev = next(iterator)
    except StopIteration:
        return
    for el in iterator:
        yield (prev, el)
        prev = el
    yield (prev, None)

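# Illustrative sketch (not part of the original module):
#
#     >>> list(_pair_iter([1, 2, 3]))
#     [(1, 2), (2, 3), (3, None)]
#     >>> list(_pair_iter([]))
#     []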

######################################################################
# { Punkt Parameters
######################################################################


class PunktParameters:
    """Stores data used to perform sentence boundary detection with Punkt."""

    def __init__(self):
        self.abbrev_types = set()
        """A set of word types for known abbreviations."""

        self.collocations = set()
        """A set of word type tuples for known common collocations
        where the first word ends in a period.  E.g., ('S.', 'Bach')
        is a common collocation in a text that discusses 'Johann
        S. Bach'.  These count as negative evidence for sentence
        boundaries."""

        self.sent_starters = set()
        """A set of word types for words that often appear at the
        beginning of sentences."""

        self.ortho_context = defaultdict(int)
        """A dictionary mapping word types to the set of orthographic
        contexts that word type appears in.  Contexts are represented
        by adding orthographic context flags: ..."""

    def clear_abbrevs(self):
        self.abbrev_types = set()

    def clear_collocations(self):
        self.collocations = set()

    def clear_sent_starters(self):
        self.sent_starters = set()

    def clear_ortho_context(self):
        self.ortho_context = defaultdict(int)

    def add_ortho_context(self, typ, flag):
        self.ortho_context[typ] |= flag

    def _debug_ortho_context(self, typ):
        context = self.ortho_context[typ]
        if context & _ORTHO_BEG_UC:
            yield "BEG-UC"
        if context & _ORTHO_MID_UC:
            yield "MID-UC"
        if context & _ORTHO_UNK_UC:
            yield "UNK-UC"
        if context & _ORTHO_BEG_LC:
            yield "BEG-LC"
        if context & _ORTHO_MID_LC:
            yield "MID-LC"
        if context & _ORTHO_UNK_LC:
            yield "UNK-LC"

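# Illustrative sketch (not part of the original module): repeated calls to
# add_ortho_context() OR new evidence into a per-type bit mask.
#
#     >>> params = PunktParameters()
#     >>> params.add_ortho_context("bach", _ORTHO_MID_UC)
#     >>> params.add_ortho_context("bach", _ORTHO_BEG_UC)
#     >>> sorted(params._debug_ortho_context("bach"))
#     ['BEG-UC', 'MID-UC']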
######################################################################
# { PunktToken
######################################################################


class PunktToken:
    """Stores a token of text with annotations produced during
    sentence boundary detection."""

    _properties = ["parastart", "linestart", "sentbreak", "abbr", "ellipsis"]
    __slots__ = ["tok", "type", "period_final"] + _properties

    def __init__(self, tok, **params):
        self.tok = tok
        self.type = self._get_type(tok)
        self.period_final = tok.endswith(".")

        for prop in self._properties:
            setattr(self, prop, None)
        for k in params:
            setattr(self, k, params[k])

    # ////////////////////////////////////////////////////////////
    # { Regular expressions for properties
    # ////////////////////////////////////////////////////////////
    # Note: [A-Za-z] is approximated by [^\W\d] in the general case.
    _RE_ELLIPSIS = re.compile(r"\.\.+$")
    _RE_NUMERIC = re.compile(r"^-?[\.,]?\d[\d,\.-]*\.?$")
    _RE_INITIAL = re.compile(r"[^\W\d]\.$", re.UNICODE)
    _RE_ALPHA = re.compile(r"[^\W\d]+$", re.UNICODE)

    # ////////////////////////////////////////////////////////////
    # { Derived properties
    # ////////////////////////////////////////////////////////////

    def _get_type(self, tok):
        """Returns a case-normalized representation of the token."""
        return self._RE_NUMERIC.sub("##number##", tok.lower())

    @property
    def type_no_period(self):
        """
        The type with its final period removed if it has one.
        """
        if len(self.type) > 1 and self.type[-1] == ".":
            return self.type[:-1]
        return self.type

    @property
    def type_no_sentperiod(self):
        """
        The type with its final period removed if it is marked as a
        sentence break.
        """
        if self.sentbreak:
            return self.type_no_period
        return self.type

    @property
    def first_upper(self):
        """True if the token's first character is uppercase."""
        return self.tok[0].isupper()

    @property
    def first_lower(self):
        """True if the token's first character is lowercase."""
        return self.tok[0].islower()

    @property
    def first_case(self):
        if self.first_lower:
            return "lower"
        if self.first_upper:
            return "upper"
        return "none"

    @property
    def is_ellipsis(self):
        """True if the token text is that of an ellipsis."""
        return self._RE_ELLIPSIS.match(self.tok)

    @property
    def is_number(self):
        """True if the token text is that of a number."""
        return self.type.startswith("##number##")

    @property
    def is_initial(self):
        """True if the token text is that of an initial."""
        return self._RE_INITIAL.match(self.tok)

    @property
    def is_alpha(self):
        """True if the token text is all alphabetic."""
        return self._RE_ALPHA.match(self.tok)

    @property
    def is_non_punct(self):
        """True if the token is either a number or is alphabetic."""
        return _re_non_punct.search(self.type)

    # ////////////////////////////////////////////////////////////
    # { String representation
    # ////////////////////////////////////////////////////////////

    def __repr__(self):
        """
        A string representation of the token that can reproduce it
        with eval(), which lists all the token's non-default
        annotations.
        """
        typestr = " type=%s," % repr(self.type) if self.type != self.tok else ""

        propvals = ", ".join(
            f"{p}={repr(getattr(self, p))}"
            for p in self._properties
            if getattr(self, p)
        )

        return "{}({},{} {})".format(
            self.__class__.__name__,
            repr(self.tok),
            typestr,
            propvals,
        )

    def __str__(self):
        """
        A string representation akin to that used by Kiss and Strunk.
        """
        res = self.tok
        if self.abbr:
            res += "<A>"
        if self.ellipsis:
            res += "<E>"
        if self.sentbreak:
            res += "<S>"
        return res

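# Illustrative sketch (not part of the original module): annotation flags show
# up in the Kiss & Strunk style string form.
#
#     >>> tok = PunktToken("Mr.", abbr=True)
#     >>> tok.period_final, tok.type_no_period
#     (True, 'mr')
#     >>> str(tok)
#     'Mr.<A>'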
######################################################################
# { Punkt base class
######################################################################


class PunktBaseClass:
    """
    Includes common components of PunktTrainer and PunktSentenceTokenizer.
    """

    def __init__(self, lang_vars=None, token_cls=PunktToken, params=None):
        if lang_vars is None:
            lang_vars = PunktLanguageVars()
        if params is None:
            params = PunktParameters()
        self._params = params
        self._lang_vars = lang_vars
        self._Token = token_cls
        """The collection of parameters that determines the behavior
        of the punkt tokenizer."""

    # ////////////////////////////////////////////////////////////
    # { Word tokenization
    # ////////////////////////////////////////////////////////////

    def _tokenize_words(self, plaintext):
        """
        Divide the given text into tokens, using the punkt word
        segmentation regular expression, and generate the resulting
        tokens, augmented with two boolean annotations for whether the
        given token occurs at the start of a paragraph or a new line,
        respectively.
        """
        parastart = False
        for line in plaintext.split("\n"):
            if line.strip():
                line_toks = iter(self._lang_vars.word_tokenize(line))

                try:
                    tok = next(line_toks)
                except StopIteration:
                    continue

                yield self._Token(tok, parastart=parastart, linestart=True)
                parastart = False

                for tok in line_toks:
                    yield self._Token(tok)
            else:
                parastart = True

    # ////////////////////////////////////////////////////////////
    # { Annotation Procedures
    # ////////////////////////////////////////////////////////////

    def _annotate_first_pass(
        self, tokens: Iterator[PunktToken]
    ) -> Iterator[PunktToken]:
        """
        Perform the first pass of annotation, which makes decisions
        based purely on the word type of each word:

          - '?', '!', and '.' are marked as sentence breaks.
          - sequences of two or more periods are marked as ellipsis.
          - any word ending in '.' that's a known abbreviation is
            marked as an abbreviation.
          - any other word ending in '.' is marked as a sentence break.

        The tokens are annotated in place and yielded back one at a time.
        """
        for aug_tok in tokens:
            self._first_pass_annotation(aug_tok)
            yield aug_tok

    def _first_pass_annotation(self, aug_tok: PunktToken) -> None:
        """
        Performs type-based annotation on a single token.
        """

        tok = aug_tok.tok

        if tok in self._lang_vars.sent_end_chars:
            aug_tok.sentbreak = True
        elif aug_tok.is_ellipsis:
            aug_tok.ellipsis = True
        elif aug_tok.period_final and not tok.endswith(".."):
            if (
                tok[:-1].lower() in self._params.abbrev_types
                or tok[:-1].lower().split("-")[-1] in self._params.abbrev_types
            ):
                aug_tok.abbr = True
            else:
                aug_tok.sentbreak = True

        return

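# Illustrative sketch (not part of the original module): with untrained
# (empty) parameters, every period-final token is marked as a sentence break,
# which is why "Dr." below is wrongly flagged until abbreviations are learned.
#
#     >>> base = PunktBaseClass()
#     >>> toks = base._annotate_first_pass(base._tokenize_words("Dr. Watson arrived."))
#     >>> print(" ".join(str(t) for t in toks))
#     Dr.<S> Watson arrived.<S>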
######################################################################
# { Punkt Trainer
######################################################################


class PunktTrainer(PunktBaseClass):
    """Learns parameters used in Punkt sentence boundary detection."""

    def __init__(
        self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken
    ):

        PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls)

        self._type_fdist = FreqDist()
        """A frequency distribution giving the frequency of each
        case-normalized token type in the training data."""

        self._num_period_toks = 0
        """The number of words ending in period in the training data."""

        self._collocation_fdist = FreqDist()
        """A frequency distribution giving the frequency of all
        bigrams in the training data where the first word ends in a
        period.  Bigrams are encoded as tuples of word types.
        Especially common collocations are extracted from this
        frequency distribution, and stored in
        ``_params``.``collocations <PunktParameters.collocations>``."""

        self._sent_starter_fdist = FreqDist()
        """A frequency distribution giving the frequency of all words
        that occur in the training data at the beginning of a sentence
        (after the first pass of annotation).  Especially common
        sentence starters are extracted from this frequency
        distribution, and stored in ``_params.sent_starters``.
        """

        self._sentbreak_count = 0
        """The total number of sentence breaks identified in training, used for
        calculating the frequent sentence starter heuristic."""

        self._finalized = True
        """A flag as to whether the training has been finalized by finding
        collocations and sentence starters, or whether finalize_training()
        still needs to be called."""

        if train_text:
            self.train(train_text, verbose, finalize=True)

    def get_params(self):
        """
        Calculates and returns parameters for sentence boundary detection as
        derived from training."""
        if not self._finalized:
            self.finalize_training()
        return self._params

    # ////////////////////////////////////////////////////////////
    # { Customization Variables
    # ////////////////////////////////////////////////////////////

    ABBREV = 0.3
    """cut-off value whether a 'token' is an abbreviation"""

    IGNORE_ABBREV_PENALTY = False
    """allows the disabling of the abbreviation penalty heuristic, which
    exponentially disadvantages words that are found at times without a
    final period."""

    ABBREV_BACKOFF = 5
    """upper cut-off for Mikheev's (2002) abbreviation detection algorithm"""

    COLLOCATION = 7.88
    """minimal log-likelihood value that two tokens need to be considered
    as a collocation"""

    SENT_STARTER = 30
    """minimal log-likelihood value that a token requires to be considered
    as a frequent sentence starter"""

    INCLUDE_ALL_COLLOCS = False
    """this includes as potential collocations all word pairs where the first
    word ends in a period. It may be useful in corpora where there is a lot
    of variation that makes abbreviations like Mr difficult to identify."""

    INCLUDE_ABBREV_COLLOCS = False
    """this includes as potential collocations all word pairs where the first
    word is an abbreviation. Such collocations override the orthographic
    heuristic, but not the sentence starter heuristic. This is overridden by
    INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
    and ordinals are considered."""

    MIN_COLLOC_FREQ = 1
    """this sets a minimum bound on the number of times a bigram needs to
    appear before it can be considered a collocation, in addition to log
    likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""

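    # Illustrative sketch (not part of the original module): the class-level
    # hyper-parameters above are designed to be overridden by subclassing.
    #
    #     >>> class PermissiveTrainer(PunktTrainer):
    #     ...     INCLUDE_ALL_COLLOCS = True  # consider every period-final bigram
    #     ...     MIN_COLLOC_FREQ = 3         # but demand more evidence per bigram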
    # ////////////////////////////////////////////////////////////
    # { Training..
    # ////////////////////////////////////////////////////////////

    def train(self, text, verbose=False, finalize=True):
        """
        Collects training data from a given text. If finalize is True, it
        will determine all the parameters for sentence boundary detection. If
        not, this will be delayed until get_params() or finalize_training() is
        called. If verbose is True, abbreviations found will be listed.
        """
        # Break the text into tokens; record which token indices correspond to
        # line starts and paragraph starts; and determine their types.
        self._train_tokens(self._tokenize_words(text), verbose)
        if finalize:
            self.finalize_training(verbose)

    def train_tokens(self, tokens, verbose=False, finalize=True):
        """
        Collects training data from a given list of tokens.
        """
        self._train_tokens((self._Token(t) for t in tokens), verbose)
        if finalize:
            self.finalize_training(verbose)

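    # Illustrative incremental-training sketch (not part of the original
    # module; corpus_a and corpus_b are placeholder strings):
    #
    #     >>> trainer = PunktTrainer()
    #     >>> trainer.train(corpus_a, finalize=False)   # accumulate counts
    #     >>> trainer.train(corpus_b, finalize=False)   # keep accumulating
    #     >>> params = trainer.get_params()             # finalizes on demand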
    def _train_tokens(self, tokens, verbose):
        self._finalized = False

        # Ensure tokens are a list
        tokens = list(tokens)

        # Find the frequency of each case-normalized type.  (Don't
        # strip off final periods.)  Also keep track of the number of
        # tokens that end in periods.
        for aug_tok in tokens:
            self._type_fdist[aug_tok.type] += 1
            if aug_tok.period_final:
                self._num_period_toks += 1

        # Look for new abbreviations, and for types that no longer are
        # abbreviations.
        unique_types = self._unique_types(tokens)
        for abbr, score, is_add in self._reclassify_abbrev_types(unique_types):
            if score >= self.ABBREV:
                if is_add:
                    self._params.abbrev_types.add(abbr)
                    if verbose:
                        print(f"  Abbreviation: [{score:6.4f}] {abbr}")
            else:
                if not is_add:
                    self._params.abbrev_types.remove(abbr)
                    if verbose:
                        print(f"  Removed abbreviation: [{score:6.4f}] {abbr}")

        # Make a preliminary pass through the document, marking likely
        # sentence breaks, abbreviations, and ellipsis tokens.
        tokens = list(self._annotate_first_pass(tokens))

        # Check what contexts each word type can appear in, given the
        # case of its first letter.
        self._get_orthography_data(tokens)

        # We need total number of sentence breaks to find sentence starters
        self._sentbreak_count += self._get_sentbreak_count(tokens)

        # The remaining heuristics relate to pairs of tokens where the first
        # ends in a period.
        for aug_tok1, aug_tok2 in _pair_iter(tokens):
            if not aug_tok1.period_final or not aug_tok2:
                continue

            # Is the first token a rare abbreviation?
            if self._is_rare_abbrev_type(aug_tok1, aug_tok2):
                self._params.abbrev_types.add(aug_tok1.type_no_period)
                if verbose:
                    print("  Rare Abbrev: %s" % aug_tok1.type)

            # Does second token have a high likelihood of starting a sentence?
            if self._is_potential_sent_starter(aug_tok2, aug_tok1):
                self._sent_starter_fdist[aug_tok2.type] += 1

            # Is this bigram a potential collocation?
            if self._is_potential_collocation(aug_tok1, aug_tok2):
                self._collocation_fdist[
                    (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod)
                ] += 1

    def _unique_types(self, tokens):
        return {aug_tok.type for aug_tok in tokens}

    def finalize_training(self, verbose=False):
        """
        Uses data that has been gathered in training to determine likely
        collocations and sentence starters.
        """
        self._params.clear_sent_starters()
        for typ, log_likelihood in self._find_sent_starters():
            self._params.sent_starters.add(typ)
            if verbose:
                print(f"  Sent Starter: [{log_likelihood:6.4f}] {typ!r}")

        self._params.clear_collocations()
        for (typ1, typ2), log_likelihood in self._find_collocations():
            self._params.collocations.add((typ1, typ2))
            if verbose:
                print(f"  Collocation: [{log_likelihood:6.4f}] {typ1!r}+{typ2!r}")

        self._finalized = True

    # ////////////////////////////////////////////////////////////
    # { Overhead reduction
    # ////////////////////////////////////////////////////////////

    def freq_threshold(
        self, ortho_thresh=2, type_thresh=2, colloc_thres=2, sentstart_thresh=2
    ):
        """
        Allows memory use to be reduced after much training by removing data
        about rare tokens that are unlikely to have a statistical effect with
        further training. Entries occurring at or above the given thresholds
        will be retained.
        """
        if ortho_thresh > 1:
            old_oc = self._params.ortho_context
            self._params.clear_ortho_context()
            for tok in self._type_fdist:
                count = self._type_fdist[tok]
                if count >= ortho_thresh:
                    self._params.ortho_context[tok] = old_oc[tok]

        self._type_fdist = self._freq_threshold(self._type_fdist, type_thresh)
        self._collocation_fdist = self._freq_threshold(
            self._collocation_fdist, colloc_thres
        )
        self._sent_starter_fdist = self._freq_threshold(
            self._sent_starter_fdist, sentstart_thresh
        )

    def _freq_threshold(self, fdist, threshold):
        """
        Returns a FreqDist containing only data with counts at or above a
        given threshold, as well as a mapping (None -> count_removed).
        """
        # We assume that there is more data below the threshold than above it
        # and so create a new FreqDist rather than working in place.
        res = FreqDist()
        num_removed = 0
        for tok in fdist:
            count = fdist[tok]
            if count < threshold:
                num_removed += 1
            else:
                res[tok] += count
        res[None] += num_removed
        return res

    # ////////////////////////////////////////////////////////////
    # { Orthographic data
    # ////////////////////////////////////////////////////////////

    def _get_orthography_data(self, tokens):
        """
        Collect information about whether each token type occurs
        with different case patterns (i) overall, (ii) at
        sentence-initial positions, and (iii) at sentence-internal
        positions.
        """
        # 'initial' or 'internal' or 'unknown'
        context = "internal"
        tokens = list(tokens)

        for aug_tok in tokens:
            # If we encounter a paragraph break, then it's a good sign
            # that it's a sentence break.  But err on the side of
            # caution (by not positing a sentence break) if we just
            # saw an abbreviation.
            if aug_tok.parastart and context != "unknown":
                context = "initial"

            # If we're at the beginning of a line, then we can't decide
            # between 'internal' and 'initial'.
            if aug_tok.linestart and context == "internal":
                context = "unknown"

            # Find the case-normalized type of the token.  If it's a
            # sentence-final token, strip off the period.
            typ = aug_tok.type_no_sentperiod

            # Update the orthographic context table.
            flag = _ORTHO_MAP.get((context, aug_tok.first_case), 0)
            if flag:
                self._params.add_ortho_context(typ, flag)

            # Decide whether the next word is at a sentence boundary.
            if aug_tok.sentbreak:
                if not (aug_tok.is_number or aug_tok.is_initial):
                    context = "initial"
                else:
                    context = "unknown"
            elif aug_tok.ellipsis or aug_tok.abbr:
                context = "unknown"
            else:
                context = "internal"

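    # Illustrative sketch (not part of the original module): a lowercase token
    # right after a sentence break is recorded as sentence-initial.
    #
    #     >>> trainer = PunktTrainer()
    #     >>> trainer._get_orthography_data(
    #     ...     trainer._annotate_first_pass(trainer._tokenize_words("Stop. world"))
    #     ... )
    #     >>> bool(trainer._params.ortho_context["world"] & _ORTHO_BEG_LC)
    #     True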
    # ////////////////////////////////////////////////////////////
    # { Abbreviations
    # ////////////////////////////////////////////////////////////

    def _reclassify_abbrev_types(self, types):
        """
        (Re)classifies each given token if
          - it is period-final and not a known abbreviation; or
          - it is not period-final and is otherwise a known abbreviation
        by checking whether its previous classification still holds according
        to the heuristics of section 3.
        Yields triples (abbr, score, is_add) where abbr is the type in question,
        score is its log-likelihood with penalties applied, and is_add specifies
        whether the present type is a candidate for inclusion or exclusion as an
        abbreviation, such that:
          - (is_add and score >= 0.3)    suggests a new abbreviation; and
          - (not is_add and score < 0.3) suggests excluding an abbreviation.
        """
        # (While one could recalculate abbreviations from all .-final tokens at
        # every iteration, in cases requiring efficiency, the number of tokens
        # in the present training document will be much less.)

        for typ in types:
            # Check some basic conditions, to rule out words that are
            # clearly not abbrev_types.
            if not _re_non_punct.search(typ) or typ == "##number##":
                continue

            if typ.endswith("."):
                if typ in self._params.abbrev_types:
                    continue
                typ = typ[:-1]
                is_add = True
            else:
                if typ not in self._params.abbrev_types:
                    continue
                is_add = False

            # Count how many periods & nonperiods are in the
            # candidate.
            num_periods = typ.count(".") + 1
            num_nonperiods = len(typ) - num_periods + 1

            # Let <a> be the candidate without the period, and <b>
            # be the period.  Find a log likelihood ratio that
            # indicates whether <ab> occurs as a single unit (high
            # value of log_likelihood), or as two independent units <a> and
            # <b> (low value of log_likelihood).
            count_with_period = self._type_fdist[typ + "."]
            count_without_period = self._type_fdist[typ]
            log_likelihood = self._dunning_log_likelihood(
                count_with_period + count_without_period,
                self._num_period_toks,
                count_with_period,
                self._type_fdist.N(),
            )

            # Apply three scaling factors to 'tweak' the basic log
            # likelihood ratio:
            #   F_length: long word -> less likely to be an abbrev
            #   F_periods: more periods -> more likely to be an abbrev
            #   F_penalty: penalize occurrences w/o a period
            f_length = math.exp(-num_nonperiods)
            f_periods = num_periods
            f_penalty = int(self.IGNORE_ABBREV_PENALTY) or math.pow(
                num_nonperiods, -count_without_period
            )
            score = log_likelihood * f_length * f_periods * f_penalty

            yield typ, score, is_add

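    # Illustrative worked example (not part of the original module): for the
    # candidate type "dr", num_periods = 1 and num_nonperiods = 2, so
    # f_length = exp(-2) ~= 0.135 and f_periods = 1; if "dr" never appears
    # without its period, f_penalty = 2 ** -0 = 1, leaving
    # score = log_likelihood * 0.135.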
    def find_abbrev_types(self):
        """
        Recalculates abbreviations given type frequencies, despite no prior
        determination of abbreviations.
        This fails to include abbreviations otherwise found as "rare".
        """
        self._params.clear_abbrevs()
        tokens = (typ for typ in self._type_fdist if typ and typ.endswith("."))
        for abbr, score, _is_add in self._reclassify_abbrev_types(tokens):
            if score >= self.ABBREV:
                self._params.abbrev_types.add(abbr)

    # This function combines the work done by the original code's
    # functions `count_orthography_context`, `get_orthography_count`,
    # and `get_rare_abbreviations`.
    def _is_rare_abbrev_type(self, cur_tok, next_tok):
        """
        A word type is counted as a rare abbreviation if...
          - it's not already marked as an abbreviation
          - it occurs fewer than ABBREV_BACKOFF times
          - either it is followed by a sentence-internal punctuation
            mark, *or* it is followed by a lower-case word that
            sometimes appears with upper case, but never occurs with
            lower case at the beginning of sentences.
        """
        if cur_tok.abbr or not cur_tok.sentbreak:
            return False

        # Find the case-normalized type of the token.  If it's
        # a sentence-final token, strip off the period.
        typ = cur_tok.type_no_sentperiod

        # Proceed only if the type hasn't been categorized as an
        # abbreviation already, and is sufficiently rare...
        count = self._type_fdist[typ] + self._type_fdist[typ[:-1]]
        if typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF:
            return False

        # Record this token as an abbreviation if the next
        # token is a sentence-internal punctuation mark.
        # [XX] :1 or check the whole thing??
        if next_tok.tok[:1] in self._lang_vars.internal_punctuation:
            return True

        # Record this type as an abbreviation if the next
        # token...  (i) starts with a lower case letter,
        # (ii) sometimes occurs with an uppercase letter,
        # and (iii) never occurs with an uppercase letter
        # sentence-internally.
        # [xx] should the check for (ii) be modified??
        if next_tok.first_lower:
            typ2 = next_tok.type_no_sentperiod
            typ2ortho_context = self._params.ortho_context[typ2]
            if (typ2ortho_context & _ORTHO_BEG_UC) and not (
                typ2ortho_context & _ORTHO_MID_UC
            ):
                return True

    # ////////////////////////////////////////////////////////////
    # { Log Likelihoods
    # ////////////////////////////////////////////////////////////

    # helper for _reclassify_abbrev_types:
    @staticmethod
    def _dunning_log_likelihood(count_a, count_b, count_ab, N):
        """
        A function that calculates the modified Dunning log-likelihood
        ratio scores for abbreviation candidates.  The details of how
        this works are available in the paper.
        """
        p1 = count_b / N
        p2 = 0.99

        null_hypo = count_ab * math.log(p1) + (count_a - count_ab) * math.log(1.0 - p1)
        alt_hypo = count_ab * math.log(p2) + (count_a - count_ab) * math.log(1.0 - p2)

        likelihood = null_hypo - alt_hypo

        return -2.0 * likelihood

| 1083 |
+
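    # Illustrative note (not part of the original source): the score above
    # is -2 * log(L(null) / L(alt)) for a binomial model of how often the
    # candidate type is followed by a period.  Under the null hypothesis the
    # period rate is the corpus-wide rate p1 = count_b / N; under the
    # alternative it is fixed near certainty (p2 = 0.99), since true
    # abbreviations are almost always written with a trailing period.
    # Larger scores therefore mean stronger evidence for the abbreviation
    # reading.
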
    @staticmethod
    def _col_log_likelihood(count_a, count_b, count_ab, N):
        """
        Computes the log-likelihood estimate described in algorithms 6
        and 7 of the original paper.

        This *should* compute the original Dunning log-likelihood values,
        unlike the previous log_l function, which used the modified
        Dunning log-likelihood values.
        """
        p = count_b / N
        p1 = count_ab / count_a
        try:
            p2 = (count_b - count_ab) / (N - count_a)
        except ZeroDivisionError:
            p2 = 1

        try:
            summand1 = count_ab * math.log(p) + (count_a - count_ab) * math.log(1.0 - p)
        except ValueError:
            summand1 = 0

        try:
            summand2 = (count_b - count_ab) * math.log(p) + (
                N - count_a - count_b + count_ab
            ) * math.log(1.0 - p)
        except ValueError:
            summand2 = 0

        if count_a == count_ab or p1 <= 0 or p1 >= 1:
            summand3 = 0
        else:
            summand3 = count_ab * math.log(p1) + (count_a - count_ab) * math.log(
                1.0 - p1
            )

        if count_b == count_ab or p2 <= 0 or p2 >= 1:
            summand4 = 0
        else:
            summand4 = (count_b - count_ab) * math.log(p2) + (
                N - count_a - count_b + count_ab
            ) * math.log(1.0 - p2)

        likelihood = summand1 + summand2 - summand3 - summand4

        return -2.0 * likelihood

    # ////////////////////////////////////////////////////////////
    # { Collocation Finder
    # ////////////////////////////////////////////////////////////

    def _is_potential_collocation(self, aug_tok1, aug_tok2):
        """
        Returns True if the pair of tokens may form a collocation given
        log-likelihood statistics.
        """
        return (
            (
                self.INCLUDE_ALL_COLLOCS
                or (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr)
                or (aug_tok1.sentbreak and (aug_tok1.is_number or aug_tok1.is_initial))
            )
            and aug_tok1.is_non_punct
            and aug_tok2.is_non_punct
        )

    def _find_collocations(self):
        """
        Generates likely collocations and their log-likelihood.
        """
        for types in self._collocation_fdist:
            try:
                typ1, typ2 = types
            except TypeError:
                # types may be None after calling freq_threshold()
                continue
            if typ2 in self._params.sent_starters:
                continue

            col_count = self._collocation_fdist[types]
            typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + "."]
            typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + "."]
            if (
                typ1_count > 1
                and typ2_count > 1
                and self.MIN_COLLOC_FREQ < col_count <= min(typ1_count, typ2_count)
            ):
                log_likelihood = self._col_log_likelihood(
                    typ1_count, typ2_count, col_count, self._type_fdist.N()
                )
                # Filter out the not-so-collocative
                if log_likelihood >= self.COLLOCATION and (
                    self._type_fdist.N() / typ1_count > typ2_count / col_count
                ):
                    yield (typ1, typ2), log_likelihood

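    # Illustrative note (not part of the original source): the final ratio
    # test keeps a pair only when the two types co-occur more often than
    # chance would predict.  With made-up counts, typ1 "mr" seen 20 times,
    # typ2 "smith" seen 10 times, co-occurring 8 times in a 10000-token
    # corpus: 10000 / 20 = 500 > 10 / 8 = 1.25, so the pair survives,
    # provided its log-likelihood also clears self.COLLOCATION.
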
    # ////////////////////////////////////////////////////////////
    # { Sentence-Starter Finder
    # ////////////////////////////////////////////////////////////

    def _is_potential_sent_starter(self, cur_tok, prev_tok):
        """
        Returns True given a token and the token that precedes it if it
        seems clear that the token is beginning a sentence.
        """
        # If a token (i) is preceded by a sentence break that is
        # not a potential ordinal number or initial, and (ii) is
        # alphabetic, then it is a sentence-starter.
        return (
            prev_tok.sentbreak
            and not (prev_tok.is_number or prev_tok.is_initial)
            and cur_tok.is_alpha
        )

    def _find_sent_starters(self):
        """
        Uses collocation heuristics for each candidate token to
        determine if it frequently starts sentences.
        """
        for typ in self._sent_starter_fdist:
            if not typ:
                continue

            typ_at_break_count = self._sent_starter_fdist[typ]
            typ_count = self._type_fdist[typ] + self._type_fdist[typ + "."]
            if typ_count < typ_at_break_count:
                # needed after freq_threshold
                continue

            log_likelihood = self._col_log_likelihood(
                self._sentbreak_count,
                typ_count,
                typ_at_break_count,
                self._type_fdist.N(),
            )

            if (
                log_likelihood >= self.SENT_STARTER
                and self._type_fdist.N() / self._sentbreak_count
                > typ_count / typ_at_break_count
            ):
                yield typ, log_likelihood

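    # Illustrative note (not part of the original source): the ratio test
    # above asks whether the type follows sentence breaks more often than
    # its overall frequency predicts.  With made-up counts, a type seen 50
    # times, 30 of them right after a sentence break, in a corpus with 500
    # breaks and 10000 tokens: 10000 / 500 = 20 > 50 / 30 ~= 1.67, so it is
    # kept as a frequent sentence starter, provided the log-likelihood also
    # clears self.SENT_STARTER.
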
    def _get_sentbreak_count(self, tokens):
        """
        Returns the number of sentence breaks marked in a given set of
        augmented tokens.
        """
        return sum(1 for aug_tok in tokens if aug_tok.sentbreak)


######################################################################
# { Punkt Sentence Tokenizer
######################################################################


class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
    """
    A sentence tokenizer which uses an unsupervised algorithm to build
    a model for abbreviation words, collocations, and words that start
    sentences; and then uses that model to find sentence boundaries.
    This approach has been shown to work well for many European
    languages.
    """

    def __init__(
        self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken
    ):
        """
        train_text can either be the sole training text for this sentence
        boundary detector, or can be a PunktParameters object.
        """
        PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls)

        if train_text:
            self._params = self.train(train_text, verbose)

    def train(self, train_text, verbose=False):
        """
        Derives parameters from a given training text, or uses the parameters
        given.  Repeated calls to this method destroy previous parameters.  For
        incremental training, instantiate a separate PunktTrainer instance.
        """
        if not isinstance(train_text, str):
            return train_text
        return PunktTrainer(
            train_text, lang_vars=self._lang_vars, token_cls=self._Token
        ).get_params()

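    # Illustrative usage (not part of the original source); `corpus_text`
    # is a hypothetical training string:
    #
    #     >>> tokenizer = PunktSentenceTokenizer()  # doctest: +SKIP
    #     >>> tokenizer._params = tokenizer.train(corpus_text)
    #
    # Passing a PunktParameters object instead of a string returns it
    # unchanged, which is how pre-trained parameters are reused.
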
    # ////////////////////////////////////////////////////////////
    # { Tokenization
    # ////////////////////////////////////////////////////////////

    def tokenize(self, text: str, realign_boundaries: bool = True) -> List[str]:
        """
        Given a text, returns a list of the sentences in that text.
        """
        return list(self.sentences_from_text(text, realign_boundaries))

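    # Illustrative usage (not part of the original source); output shown
    # for a tokenizer trained on suitable text:
    #
    #     >>> pst = PunktSentenceTokenizer(corpus_text)  # doctest: +SKIP
    #     >>> pst.tokenize("Call Mr. Brown. He is in.")
    #     ['Call Mr. Brown.', 'He is in.']
    #
    # Whether "Mr." is treated as an abbreviation depends entirely on the
    # trained parameters, so the exact split may differ.
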
    def debug_decisions(self, text: str) -> Iterator[Dict[str, Any]]:
        """
        Classifies candidate periods as sentence breaks, yielding a dict for
        each that may be used to understand why the decision was made.

        See format_debug_decision() to help make this output readable.
        """

        for match, decision_text in self._match_potential_end_contexts(text):
            tokens = self._tokenize_words(decision_text)
            tokens = list(self._annotate_first_pass(tokens))
            while tokens and not tokens[0].tok.endswith(self._lang_vars.sent_end_chars):
                tokens.pop(0)
            yield {
                "period_index": match.end() - 1,
                "text": decision_text,
                "type1": tokens[0].type,
                "type2": tokens[1].type,
                "type1_in_abbrs": bool(tokens[0].abbr),
                "type1_is_initial": bool(tokens[0].is_initial),
                "type2_is_sent_starter": tokens[1].type_no_sentperiod
                in self._params.sent_starters,
                "type2_ortho_heuristic": self._ortho_heuristic(tokens[1]),
                "type2_ortho_contexts": set(
                    self._params._debug_ortho_context(tokens[1].type_no_sentperiod)
                ),
                "collocation": (
                    tokens[0].type_no_sentperiod,
                    tokens[1].type_no_sentperiod,
                )
                in self._params.collocations,
                "reason": self._second_pass_annotation(tokens[0], tokens[1])
                or REASON_DEFAULT_DECISION,
                "break_decision": tokens[0].sentbreak,
            }

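    # Illustrative sketch (not part of the original source): pairing
    # debug_decisions() with the module-level format_debug_decision() to
    # inspect why each candidate period was (or was not) treated as a
    # sentence break:
    #
    #     >>> pst = PunktSentenceTokenizer()  # doctest: +SKIP
    #     >>> for decision in pst.debug_decisions("Mr. Brown arrived."):
    #     ...     print(format_debug_decision(decision))
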
    def span_tokenize(
        self, text: str, realign_boundaries: bool = True
    ) -> Iterator[Tuple[int, int]]:
        """
        Given a text, generates (start, end) spans of sentences
        in the text.
        """
        slices = self._slices_from_text(text)
        if realign_boundaries:
            slices = self._realign_boundaries(text, slices)
        for sentence in slices:
            yield (sentence.start, sentence.stop)

    def sentences_from_text(
        self, text: str, realign_boundaries: bool = True
    ) -> List[str]:
        """
        Given a text, generates the sentences in that text by only
        testing candidate sentence breaks.  If realign_boundaries is
        True, includes in the sentence closing punctuation that
        follows the period.
        """
        return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]

    def _get_last_whitespace_index(self, text: str) -> int:
        """
        Given a text, find the index of the *last* occurrence of *any*
        whitespace character, i.e. " ", "\n", "\t", "\r", etc.
        If none is found, return 0.
        """
        for i in range(len(text) - 1, -1, -1):
            if text[i] in string.whitespace:
                return i
        return 0

    def _match_potential_end_contexts(self, text: str) -> Iterator[Tuple[Match, str]]:
        """
        Given a text, find the matches of potential sentence breaks,
        alongside the contexts surrounding these sentence breaks.

        Since the fix for the ReDoS discovered in issue #2866, we no longer match
        the word before a potential end of sentence token.  Instead, we use a separate
        regex for this.  As a consequence, `finditer`'s desire to find non-overlapping
        matches no longer aids us in finding the single longest match.
        Where previously, we could use::

            >>> pst = PunktSentenceTokenizer()
            >>> text = "Very bad acting!!! I promise."
            >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +SKIP
            [<re.Match object; span=(9, 18), match='acting!!!'>]

        Now we have to find the word before (i.e. 'acting') separately, and `finditer`
        returns::

            >>> pst = PunktSentenceTokenizer()
            >>> text = "Very bad acting!!! I promise."
            >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +NORMALIZE_WHITESPACE
            [<re.Match object; span=(15, 16), match='!'>,
            <re.Match object; span=(16, 17), match='!'>,
            <re.Match object; span=(17, 18), match='!'>]

        So, we need to find the word before the match from right to left, and then manually remove
        the overlaps.  That is what this method does::

            >>> pst = PunktSentenceTokenizer()
            >>> text = "Very bad acting!!! I promise."
            >>> list(pst._match_potential_end_contexts(text))
            [(<re.Match object; span=(17, 18), match='!'>, 'acting!!! I')]

        :param text: String of one or more sentences
        :type text: str
        :return: Generator of match-context tuples.
        :rtype: Iterator[Tuple[Match, str]]
        """
        previous_slice = slice(0, 0)
        previous_match = None
        for match in self._lang_vars.period_context_re().finditer(text):

            # Get the slice of the previous word
            before_text = text[previous_slice.stop : match.start()]
            index_after_last_space = self._get_last_whitespace_index(before_text)
            if index_after_last_space:
                # + 1 to exclude the space itself
                index_after_last_space += previous_slice.stop + 1
            else:
                index_after_last_space = previous_slice.start
            prev_word_slice = slice(index_after_last_space, match.start())

            # If the previous slice does not overlap with this slice, then
            # we can yield the previous match and slice.  If there is an overlap,
            # then we do not yield the previous match and slice.
            if previous_match and previous_slice.stop <= prev_word_slice.start:
                yield (
                    previous_match,
                    text[previous_slice]
                    + previous_match.group()
                    + previous_match.group("after_tok"),
                )
            previous_match = match
            previous_slice = prev_word_slice

        # Yield the last match and context, if it exists
        if previous_match:
            yield (
                previous_match,
                text[previous_slice]
                + previous_match.group()
                + previous_match.group("after_tok"),
            )

    def _slices_from_text(self, text: str) -> Iterator[slice]:
        last_break = 0
        for match, context in self._match_potential_end_contexts(text):
            if self.text_contains_sentbreak(context):
                yield slice(last_break, match.end())
                if match.group("next_tok"):
                    # next sentence starts after whitespace
                    last_break = match.start("next_tok")
                else:
                    # next sentence starts at following punctuation
                    last_break = match.end()
        # The last sentence should not contain trailing whitespace.
        yield slice(last_break, len(text.rstrip()))

    def _realign_boundaries(
        self, text: str, slices: Iterator[slice]
    ) -> Iterator[slice]:
        """
        Attempts to realign punctuation that falls after the period but
        should otherwise be included in the same sentence.

        For example: "(Sent1.) Sent2." will otherwise be split as::

            ["(Sent1.", ") Sent2."].

        This method will produce::

            ["(Sent1.)", "Sent2."].
        """
        realign = 0
        for sentence1, sentence2 in _pair_iter(slices):
            sentence1 = slice(sentence1.start + realign, sentence1.stop)
            if not sentence2:
                if text[sentence1]:
                    yield sentence1
                continue

            m = self._lang_vars.re_boundary_realignment.match(text[sentence2])
            if m:
                yield slice(sentence1.start, sentence2.start + len(m.group(0).rstrip()))
                realign = m.end()
            else:
                realign = 0
                if text[sentence1]:
                    yield sentence1

    def text_contains_sentbreak(self, text: str) -> bool:
        """
        Returns True if the given text includes a sentence break.
        """
        found = False  # used to ignore last token
        for tok in self._annotate_tokens(self._tokenize_words(text)):
            if found:
                return True
            if tok.sentbreak:
                found = True
        return False

    def sentences_from_text_legacy(self, text: str) -> Iterator[str]:
        """
        Given a text, generates the sentences in that text.  Annotates all
        tokens, rather than just those with possible sentence breaks.  Should
        produce the same results as ``sentences_from_text``.
        """
        tokens = self._annotate_tokens(self._tokenize_words(text))
        return self._build_sentence_list(text, tokens)

    def sentences_from_tokens(
        self, tokens: Iterator[PunktToken]
    ) -> Iterator[List[str]]:
        """
        Given a sequence of tokens, generates lists of tokens, each list
        corresponding to a sentence.
        """
        tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens))
        sentence = []
        for aug_tok in tokens:
            sentence.append(aug_tok.tok)
            if aug_tok.sentbreak:
                yield sentence
                sentence = []
        if sentence:
            yield sentence

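    # Illustrative sketch (not part of the original source): the input is a
    # sequence of already-split word tokens, and each yielded item is the
    # list of tokens for one sentence.  `pst` and the token values here are
    # made up, and the exact grouping depends on the trained parameters:
    #
    #     >>> list(pst.sentences_from_tokens(["Fine", ".", "Thanks", "."]))  # doctest: +SKIP
    #     [['Fine', '.'], ['Thanks', '.']]
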
    def _annotate_tokens(self, tokens: Iterator[PunktToken]) -> Iterator[PunktToken]:
        """
        Given a set of tokens augmented with markers for line-start and
        paragraph-start, returns an iterator through those tokens with full
        annotation including predicted sentence breaks.
        """
        # Make a preliminary pass through the document, marking likely
        # sentence breaks, abbreviations, and ellipsis tokens.
        tokens = self._annotate_first_pass(tokens)

        # Make a second pass through the document, using token context
        # information to change our preliminary decisions about where
        # sentence breaks, abbreviations, and ellipses occur.
        tokens = self._annotate_second_pass(tokens)

        ## [XX] TESTING
        # tokens = list(tokens)
        # self.dump(tokens)

        return tokens

    def _build_sentence_list(
        self, text: str, tokens: Iterator[PunktToken]
    ) -> Iterator[str]:
        """
        Given the original text and the list of augmented word tokens,
        construct and return a tokenized list of sentence strings.
        """
        # Most of the work here is making sure that we put the right
        # pieces of whitespace back in all the right places.

        # Our position in the source text, used to keep track of which
        # whitespace to add:
        pos = 0

        # A regular expression that finds pieces of whitespace:
        white_space_regexp = re.compile(r"\s*")

        sentence = ""
        for aug_tok in tokens:
            tok = aug_tok.tok

            # Find the whitespace before this token, and update pos.
            white_space = white_space_regexp.match(text, pos).group()
            pos += len(white_space)

            # Some of the rules used by the punkt word tokenizer
            # strip whitespace out of the text, resulting in tokens
            # that contain whitespace in the source text.  If our
            # token doesn't match, see if adding whitespace helps.
            # If so, then use the version with whitespace.
            if text[pos : pos + len(tok)] != tok:
                pat = r"\s*".join(re.escape(c) for c in tok)
                m = re.compile(pat).match(text, pos)
                if m:
                    tok = m.group()

            # Move our position pointer to the end of the token.
            assert text[pos : pos + len(tok)] == tok
            pos += len(tok)

            # Add this token.  If it's not at the beginning of the
            # sentence, then include any whitespace that separated it
            # from the previous token.
            if sentence:
                sentence += white_space
            sentence += tok

            # If we're at a sentence break, then start a new sentence.
            if aug_tok.sentbreak:
                yield sentence
                sentence = ""

        # If the last sentence is empty, discard it.
        if sentence:
            yield sentence

    # [XX] TESTING
    def dump(self, tokens: Iterator[PunktToken]) -> None:
        print("writing to /tmp/punkt.new...")
        with open("/tmp/punkt.new", "w") as outfile:
            for aug_tok in tokens:
                if aug_tok.parastart:
                    outfile.write("\n\n")
                elif aug_tok.linestart:
                    outfile.write("\n")
                else:
                    outfile.write(" ")

                outfile.write(str(aug_tok))

    # ////////////////////////////////////////////////////////////
    # { Customization Variables
    # ////////////////////////////////////////////////////////////

    PUNCTUATION = tuple(";:,.!?")

    # ////////////////////////////////////////////////////////////
    # { Annotation Procedures
    # ////////////////////////////////////////////////////////////

    def _annotate_second_pass(
        self, tokens: Iterator[PunktToken]
    ) -> Iterator[PunktToken]:
        """
        Performs a token-based classification (section 4) over the given
        tokens, making use of the orthographic heuristic (4.1.1), collocation
        heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
        """
        for token1, token2 in _pair_iter(tokens):
            self._second_pass_annotation(token1, token2)
            yield token1

    def _second_pass_annotation(
        self, aug_tok1: PunktToken, aug_tok2: Optional[PunktToken]
    ) -> Optional[str]:
        """
        Performs token-based classification over a pair of contiguous tokens
        updating the first.
        """
        # Is it the last token?  We can't do anything then.
        if not aug_tok2:
            return

        if not aug_tok1.period_final:
            # We only care about words ending in periods.
            return
        typ = aug_tok1.type_no_period
        next_typ = aug_tok2.type_no_sentperiod
        tok_is_initial = aug_tok1.is_initial

        # [4.1.2. Collocation Heuristic] If there's a
        # collocation between the word before and after the
        # period, then label tok as an abbreviation and NOT
        # a sentence break.  Note that collocations with
        # frequent sentence starters as their second word are
        # excluded in training.
        if (typ, next_typ) in self._params.collocations:
            aug_tok1.sentbreak = False
            aug_tok1.abbr = True
            return REASON_KNOWN_COLLOCATION

        # [4.2. Token-Based Reclassification of Abbreviations] If
        # the token is an abbreviation or an ellipsis, then decide
        # whether we should *also* classify it as a sentbreak.
        if (aug_tok1.abbr or aug_tok1.ellipsis) and (not tok_is_initial):
            # [4.1.1. Orthographic Heuristic] Check if there's
            # orthographic evidence about whether the next word
            # starts a sentence or not.
            is_sent_starter = self._ortho_heuristic(aug_tok2)
            if is_sent_starter is True:
                aug_tok1.sentbreak = True
                return REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC

            # [4.1.3. Frequent Sentence Starter Heuristic] If the
            # next word is capitalized, and is a member of the
            # frequent-sentence-starters list, then label tok as a
            # sentence break.
            if aug_tok2.first_upper and next_typ in self._params.sent_starters:
                aug_tok1.sentbreak = True
                return REASON_ABBR_WITH_SENTENCE_STARTER

        # [4.3. Token-Based Detection of Initials and Ordinals]
        # Check if any initials or ordinals tokens that are marked
        # as sentbreaks should be reclassified as abbreviations.
        if tok_is_initial or typ == "##number##":
            # [4.1.1. Orthographic Heuristic] Check if there's
            # orthographic evidence about whether the next word
            # starts a sentence or not.
            is_sent_starter = self._ortho_heuristic(aug_tok2)

            if is_sent_starter is False:
                aug_tok1.sentbreak = False
                aug_tok1.abbr = True
                if tok_is_initial:
                    return REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC
                return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC

            # Special heuristic for initials: if orthographic
            # heuristic is unknown, and next word is always
            # capitalized, then mark as abbrev (eg: J. Bach).
            if (
                is_sent_starter == "unknown"
                and tok_is_initial
                and aug_tok2.first_upper
                and not (self._params.ortho_context[next_typ] & _ORTHO_LC)
            ):
                aug_tok1.sentbreak = False
                aug_tok1.abbr = True
                return REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC

        return

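    # Illustrative note (not part of the original source): for the special
    # initials heuristic above, a text like "J. Bach composed it." gives an
    # "unknown" orthographic verdict for "Bach"; but because "Bach" is
    # capitalized here and was never seen lowercase in training, "J." is
    # reclassified as an abbreviation rather than a sentence break.
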
    def _ortho_heuristic(self, aug_tok: PunktToken) -> Union[bool, str]:
        """
        Decide whether the given token is the first token in a sentence.
        """
        # Sentences don't start with punctuation marks:
        if aug_tok.tok in self.PUNCTUATION:
            return False

        ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod]

        # If the word is capitalized, occurs at least once with a
        # lower case first letter, and never occurs with an upper case
        # first letter sentence-internally, then it's a sentence starter.
        if (
            aug_tok.first_upper
            and (ortho_context & _ORTHO_LC)
            and not (ortho_context & _ORTHO_MID_UC)
        ):
            return True

        # If the word is lower case, and either (a) we've seen it used
        # with upper case, or (b) we've never seen it used
        # sentence-initially with lower case, then it's not a sentence
        # starter.
        if aug_tok.first_lower and (
            (ortho_context & _ORTHO_UC) or not (ortho_context & _ORTHO_BEG_LC)
        ):
            return False

        # Otherwise, we're not sure.
        return "unknown"
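
    # Illustrative note (not part of the original source): ortho_context is
    # a bitmask accumulated during training.  Flags such as _ORTHO_BEG_UC
    # ("seen capitalized at a sentence start") and _ORTHO_MID_UC ("seen
    # capitalized sentence-internally") are combined above, so "capitalized
    # now, seen lowercase somewhere, never capitalized mid-sentence"
    # yields True.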


DEBUG_DECISION_FMT = """Text: {text!r} (at offset {period_index})
Sentence break? {break_decision} ({reason})
Collocation? {collocation}
{type1!r}:
    known abbreviation: {type1_in_abbrs}
    is initial: {type1_is_initial}
{type2!r}:
    known sentence starter: {type2_is_sent_starter}
    orthographic heuristic suggests is a sentence starter? {type2_ortho_heuristic}
    orthographic contexts in training: {type2_ortho_contexts}
"""


def format_debug_decision(d):
    return DEBUG_DECISION_FMT.format(**d)


def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
    """Builds a punkt model and applies it to the same text"""
    cleanup = (
        lambda s: re.compile(r"(?:\r|^\s+)", re.MULTILINE).sub("", s).replace("\n", " ")
    )
    trainer = train_cls()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sbd = tok_cls(trainer.get_params())
    for sentence in sbd.sentences_from_text(text):
        print(cleanup(sentence))
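

# Illustrative usage of demo() (not part of the original source); the sample
# text is made up:
#
#     >>> demo("Dr. Brown arrived at 10 a.m. He left an hour later.")  # doctest: +SKIP
#
# This trains a throwaway model on the given text and prints one cleaned-up
# sentence per line.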