File size: 2,079 Bytes
81446ef
 
 
bce5fb4
 
81446ef
 
 
8b26782
81446ef
 
 
7057a87
 
cfbeebd
81446ef
 
ae44a4c
7057a87
4570d11
 
b9c3b16
81446ef
eace371
81446ef
b9c3b16
81446ef
b9c3b16
 
 
 
 
 
81446ef
b9c3b16
 
 
 
7057a87
 
 
016223f
81446ef
b9b0b35
 
016223f
7057a87
016223f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
import streamlit as st
from keybert import KeyBERT
import yake
from keyphrase_vectorizers import KeyphraseCountVectorizer

# NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
# `st.cache_resource` -- confirm against the pinned Streamlit version
# before switching.
@st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=True)
def load_model():
    """Return a KeyBERT instance built from the BigBird-Pegasus patent model id.

    The function is wrapped in ``st.cache`` so the instance is created once
    and reused across script reruns; ``allow_output_mutation=True`` tells
    Streamlit not to hash-check the returned object for changes.
    """
    return KeyBERT("google/bigbird-pegasus-large-bigpatent")
  
model = load_model()

# Number of phrases requested from YAKE and of keywords requested from
# KeyBERT; kept in one place so the two stay in sync.
TOP_N = 50

st.title("Patent Text Extractor")
placeholder = st.empty()
text_input = placeholder.text_area("Paste or write text", height=300)
button = st.button("Extract Keywords")

# Only run the extraction when the user explicitly asks for it. Streamlit
# re-executes this script on every interaction, so without this guard the
# model would run on the initial page load with an empty text area.
if button:
    if not text_input.strip():
        st.warning("Please paste or write some text first.")
    else:
        # YAKE yields (phrase, score) tuples; KeyBERT's `candidates`
        # argument takes the phrases themselves, so drop the scores.
        kw_extractor = yake.KeywordExtractor(top=TOP_N)
        candidates = kw_extractor.extract_keywords(text_input)
        keyphrases = [phrase for phrase, _score in candidates]

        keywords = model.extract_keywords(
            text_input,
            candidates=keyphrases,
            keyphrase_ngram_range=(1, 3),
            top_n=TOP_N,
            stop_words="english",
            vectorizer=KeyphraseCountVectorizer(),
        )

        if keywords:
            st.info("Extracted keywords")
            keywords = pd.DataFrame(keywords, columns=["Keyword", "Score"])
            st.table(keywords)
        else:
            st.warning("No keywords could be extracted from the text.")