# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
| """ | |
| Preprocesses pretrained word embeddings, creates dev sets for tasks without a | |
| provided one, and figures out the set of output classes for each task. | |
| """ | |
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random

from base import configure
from base import embeddings
from base import utils
from task_specific.word_level import word_level_data


def main(data_dir='./data'):
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for pretrained in ['glove.6B.300d.txt']:
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True,
                              pretrained_embeddings=pretrained,
                              word_embedding_size=300)
    embeddings.PretrainedEmbeddingLoader(config).build()
| utils.log("CONSTRUCTING DEV SETS") | |
| for task_name in ["chunk"]: | |
| # chunking does not come with a provided dev split, so create one by | |
| # selecting a random subset of the data | |
| config = configure.Config(data_dir=data_dir, | |
| for_preprocessing=True) | |
| task_data_dir = os.path.join(config.raw_data_topdir, task_name) + '/' | |
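    # The third TaggedDataLoader argument flags token-level tasks (see the
    # label-mapping loop below); chunking is span-level, so it is False here.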
    train_sentences = word_level_data.TaggedDataLoader(
        config, task_name, False).get_labeled_sentences("train")
    random.shuffle(train_sentences)
    write_sentences(task_data_dir + 'train_subset.txt', train_sentences[1500:])
    write_sentences(task_data_dir + 'dev.txt', train_sentences[:1500])
| utils.log("WRITING LABEL MAPPINGS") | |
| for task_name in ["chunk"]: | |
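    # Span-level tasks get one label mapping per encoding (only BIOES here);
    # token-level tasks have a single fixed label set, so only the first
    # encoding's mapping is written for them.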
    for i, label_encoding in enumerate(["BIOES"]):
      config = configure.Config(data_dir=data_dir,
                                for_preprocessing=True,
                                label_encoding=label_encoding)
      token_level = task_name in ["ccg", "pos", "depparse"]
      loader = word_level_data.TaggedDataLoader(config, task_name, token_level)
      if token_level:
        if i != 0:
          continue
        utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
      else:
        utils.log("  Writing label mapping for", task_name.upper(),
                  label_encoding)
      utils.log("  ", len(loader.label_mapping), "classes")
      utils.write_cpickle(loader.label_mapping,
                          loader.label_mapping_path)


def write_sentences(fname, sentences):
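  """Writes (words, tags) sentences to fname in CoNLL-style format: one
  "word tag" pair per line, with a blank line between sentences."""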
  with open(fname, 'w') as f:
    for words, tags in sentences:
      for word, tag in zip(words, tags):
        f.write(word + " " + tag + "\n")
      f.write("\n")


if __name__ == '__main__':
  main()