# spaCy-based NLP pipeline: sentence splitting, tokenization, NER,
# and keyword extraction (YAKE via spacy_ke, or pytextrank variants).
import spacy | |
# Public API of this module. Must be a sequence of names, not a bare string:
# `__all__ = 'SpacyPipeline'` would make `from module import *` iterate the
# string character-by-character and fail on the first missing single-letter name.
__all__ = ['SpacyPipeline']
# Mapping from ISO 639-1 language code to the spaCy model loaded for it.
# Languages not listed here fall back to "vi_core_news_lg" (see SpacyPipeline.__init__).
MODELS = {
    "en": "en_core_web_sm",
    "ja": "ja_core_news_sm",
    "zh": "zh_core_web_sm",
    "de": "de_core_news_sm",
    "es": "es_core_news_sm",
    "it": "it_core_news_sm",
    "ko": "ko_core_news_sm",
    "ru": "ru_core_news_sm",
    "fr": "fr_core_news_sm",
    "vi": "vi_core_news_lg",
}

# Algorithms accepted by SpacyPipeline.
# Fixes: 'positionrank' was listed twice and 'yake' was missing, which made the
# yake branch in SpacyPipeline.__init__ unreachable (the membership check on
# VALID_METHODS rejected 'yake' before the branch could run).
VALID_METHODS = ['positionrank', 'textrank', 'biasedtextrank', 'yake', 'ner']
class SpacyPipeline:
    """Thin wrapper around a spaCy model providing sentence splitting,
    tokenization, NER, and keyword extraction.

    Keyword extraction is delegated either to spacy_ke (YAKE) or to
    pytextrank (textrank / biasedtextrank / positionrank), selected by
    the ``algorithm`` constructor argument.
    """

    def __init__(self, language, algorithm: str = None):
        """Load the spaCy model for ``language`` and configure the pipeline.

        Parameters
        ----------
        language : str
            ISO 639-1 code; unknown codes fall back to "vi_core_news_lg".
        algorithm : str, optional
            One of ``VALID_METHODS`` ('ner' needs no extra pipe component);
            ``None`` disables keyword extraction set-up.

        Raises
        ------
        ValueError
            If ``algorithm`` is not in ``VALID_METHODS``.
        """
        # dict.get with a default replaces the original conditional expression.
        model = MODELS.get(language, "vi_core_news_lg")
        self.nlp = spacy.load(model)
        self.nlp.add_pipe("sentencizer")
        self.algorithm = algorithm
        # Which backing library provides keyword extraction; None until chosen.
        self.library = None
        if self.algorithm is not None and self.algorithm != 'ner':
            # Explicit raise instead of assert: asserts are stripped under -O,
            # and ValueError matches the error type already used below.
            if algorithm not in VALID_METHODS:
                raise ValueError(f'invalid algorithm {algorithm}\n- valid list: {VALID_METHODS}')
            if self.algorithm == 'yake':
                import spacy_ke  # noqa: F401 -- import registers the "yake" pipe component
                self.nlp.add_pipe("yake")
                self.library = 'spacy_ke'
            elif self.algorithm in ['textrank', 'biasedtextrank', 'positionrank']:
                import pytextrank  # noqa: F401 -- import registers the pytextrank components
                self.nlp.add_pipe(algorithm)
                self.library = 'pytextrank'
            else:
                # Defensive: unreachable while VALID_METHODS mirrors the branches above.
                raise ValueError(f'unknown algorithm: {self.algorithm}')

    def _get_keyword(self, output, original_document=None, n=None):
        """Extract up to ``n`` keywords from a processed ``Doc``.

        For 'ner' the entities themselves are the keywords; otherwise the
        configured extraction library is queried and results are filtered to
        phrases that literally occur in ``original_document``.
        """
        if self.algorithm == 'ner':
            return [str(i) for i in output.ents]
        # Explicit errors instead of bare asserts (stripped under -O).
        if original_document is None:
            raise ValueError('original_document is required unless algorithm is "ner"')
        if n is None:
            raise ValueError('n is required unless algorithm is "ner"')
        if self.library == 'spacy_ke':
            return [str(term) for term, score in output._.extract_keywords(n)
                    if str(term) in original_document]
        return [str(i.text) for i in output._.phrases[:n] if str(i.text) in original_document]

    def sentence_keyword(self, string: str, n: int = 10):
        """Return ``(sentences, keywords)`` for ``string`` in a single pass."""
        out = self.nlp(string)
        sentence = [str(i) for i in out.sents if len(i) > 0]
        keyword = self._get_keyword(out, string, n)
        return sentence, keyword

    def sentence(self, string: str):
        """Split ``string`` into a list of non-empty sentences."""
        return [str(i) for i in self.nlp(string).sents if len(i) > 0]

    def token(self, string: str):
        """Tokenize ``string`` (tokenizer only; no full pipeline run)."""
        return [str(i) for i in self.nlp.tokenizer(string)]

    def keyword(self, string: str, n: int = 10):
        """Extract up to ``n`` keywords from ``string``."""
        return self._get_keyword(self.nlp(string), string, n)

    def ner(self, string: str, n: int = None):
        """Return named entities as spaCy spans, truncated to ``n`` if given."""
        keywords = self.nlp(string).ents
        # Slicing already clamps to the sequence length, so the original
        # min(len(keywords), n) bound was redundant.
        return keywords[:n] if n is not None else keywords

    def language(self):
        """Return the language code of the loaded spaCy model."""
        return self.nlp.lang