NLP Flashcards

1
Q

How to create a Doc object in Spacy?

A

import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'This is some text')

2
Q

What is a span in Spacy?

A

A Span is a slice of a Doc object: doc[start:end]
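
For example (a minimal sketch; assumes the en_core_web_sm model is installed):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'This is some text about natural language processing')
span = doc[3:6]          # a Span covering tokens 3, 4 and 5
print(span.text)         # 'text about natural'
print(type(span))        # <class 'spacy.tokens.span.Span'>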

3
Q

What are noun_chunks in Spacy?

A

Base noun phrases: flat noun phrases with a noun as their head, exposed by the doc.noun_chunks generator
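
A quick sketch (assumes nlp is a loaded English pipeline; the labels in the comments are what en_core_web_sm typically produces for this sentence):

doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')

for chunk in doc.noun_chunks:
    print(chunk.text, '|', chunk.root.text, '|', chunk.root.dep_)

# Autonomous cars | cars | nsubj
# insurance liability | liability | dobj
# manufacturers | manufacturers | pobj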

4
Q

How to visualize in Spacy?

A

from spacy import displacy

# In a Jupyter notebook (style='dep' for dependencies, style='ent' for entities)
displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})

# Outside a notebook: serve the visualization and open 127.0.0.1:<port> in a browser
displacy.serve(doc, style='dep')

5
Q

How to get a list of stopwords in Spacy?

A

import spacy
nlp = spacy.load('en_core_web_sm')
print(nlp.Defaults.stop_words)

6
Q

How to check if a word is a stop word in Spacy?

A

nlp.vocab['word'].is_stop

7
Q

How to add a stop word in Spacy?

A

nlp.Defaults.stop_words.add('btw')

nlp.vocab['btw'].is_stop = True

8
Q

How to remove a stop word in Spacy?

A

nlp.Defaults.stop_words.remove('btw')

nlp.vocab['btw'].is_stop = False

9
Q

How to build a library of token patterns in Spacy?

A

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]
matcher.add('SolarPower', None, pattern1, pattern2)
found_matches = matcher(doc)
print(found_matches)

10
Q

How to use a matcher for terminology lists in Spacy?

A

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']
phrase_patterns = [nlp(text) for text in phrase_list]
matcher.add('VoodooEconomics', None, *phrase_patterns)
matches = matcher(doc)

11
Q

How to count POS frequency in a text in Spacy?

A

POS_counts = doc.count_by(spacy.attrs.POS)
for k, v in sorted(POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')

12
Q

How to add a named entity in Spacy?

A

from spacy.tokens import Span
ORG = doc.vocab.strings[u'ORG']
new_ent = Span(doc, start, end, label=ORG)
doc.ents = list(doc.ents) + [new_ent]
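
Here start and end are token positions in the Doc. A usage sketch (the sentence and indices are invented; assigning doc.ents fails if the new span overlaps an entity the model already found):

doc = nlp(u'Tesla to build a U.K. factory for $6 million')

ORG = doc.vocab.strings[u'ORG']
new_ent = Span(doc, 0, 1, label=ORG)   # token 0 ("Tesla") labelled as ORG
doc.ents = list(doc.ents) + [new_ent]
print([(ent.text, ent.label_) for ent in doc.ents])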

13
Q

How to add named entities to all matching spans in Spacy?

A

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]
matcher.add('newproduct', None, *phrase_patterns)
matches = matcher(doc)
from spacy.tokens import Span
PROD = doc.vocab.strings[u'PRODUCT']
new_ents = [Span(doc, match[1],match[2],label=PROD) for match in matches]
doc.ents = list(doc.ents) + new_ents

14
Q

How to add a new rule to the pipeline in Spacy?

A
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe(set_custom_boundaries, before='parser')

15
Q

How to change segmentation rules in Spacy?

A

from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'): # handles multiple occurrences
            seen_newline = True
    yield doc[start:]      # handles the last group of tokens

sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)

16
Q

TF-IDF

A

term frequency × inverse document frequency: tf-idf(t, d) = tf(t, d) × idf(t), with idf(t) = log(N / df(t)), where N is the number of documents and df(t) is the number of documents containing term t
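
A minimal sketch of computing TF-IDF weights with scikit-learn (the corpus is invented; get_feature_names_out assumes scikit-learn >= 1.0, older versions use get_feature_names):

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['the quick brown fox',
          'the lazy dog',
          'the quick dog jumps over the lazy fox']

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)         # sparse matrix: documents x terms
print(tfidf.get_feature_names_out())    # learned vocabulary
print(X.toarray().round(2))             # TF-IDF weight of each term in each document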

17
Q

How to extract text features using scikit-learn?

A

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train)

18
Q

How to make a prediction for a new text using classification model in scikit-learn?

A

text_clf.predict(['some text here'])
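
This assumes text_clf is an already-fitted classifier. A sketch of one way such a pipeline might be built (LinearSVC is an arbitrary choice; X_train and y_train stand for your training texts and labels):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC())])

text_clf.fit(X_train, y_train)                 # X_train: iterable of raw strings, y_train: labels
print(text_clf.predict(['some text here']))    # returns the predicted label(s)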

19
Q

How to remove missing data from a dataset (with pandas)?

A

df.dropna(inplace=True)
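
A quick check before and after (a sketch; df is whatever DataFrame you are cleaning):

print(df.isnull().sum())    # missing values per column
df.dropna(inplace=True)
print(df.isnull().sum())    # should now be all zeros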

20
Q

How to remove empty (whitespace-only) strings from a dataset (with pandas)?

A

blanks = []

for i, lb, rv in df.itertuples():   # (index, label, review) for each row
    if rv.isspace():                # the review is only whitespace
        blanks.append(i)
df.drop(blanks,inplace=True)

21
Q

How does Word2vec train words against words in a corpus?

A
  • Using context to predict a target word (continuous bag of words)
  • Using a word to predict a target context (skip-gram)
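
A minimal sketch of training both variants with gensim (assumes gensim 4.x, where vector_size replaced the older size argument; the toy sentences are invented):

from gensim.models import Word2Vec

sentences = [['the', 'quick', 'brown', 'fox'],
             ['the', 'lazy', 'dog'],
             ['the', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']]

cbow = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=0)       # sg=0 -> CBOW
skipgram = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=1)   # sg=1 -> skip-gram

print(skipgram.wv.most_similar('fox', topn=3))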
22
Q

check word similarity in Spacy

A

# Word similarity requires a model with word vectors, e.g. en_core_web_md or en_core_web_lg
tokens = nlp(u'fox dog animal')

for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

23
Q

vector arithmetic in Spacy (finding similar words using vectors)

A

from scipy import spatial

cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
# Now we find the closest vectors in the vocabulary to the result of "king" - "man" + "woman"
new_vector = king - man + woman
computed_similarities = []

for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word, similarity))

computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])

print([w[0].text for w in computed_similarities[:10]])

24
Q

How to do sentiment analysis using NLTK?

A

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

import pandas as pd
df = pd.read_csv('../TextFiles/amazonreviews.tsv', sep='\t')
df.head()

df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound'] = df['scores'].apply(lambda d: d['compound'])
df['score'] = df['compound'].apply(lambda s: 'pos' if s >= 0 else 'neg')

25
Q

How to use Latent Dirichlet Allocation for topic modelling?

A
import pandas as pd
npr = pd.read_csv('npr.csv')

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtf = cv.fit_transform(npr['Article'])

from sklearn.decomposition import LatentDirichletAllocation
LDA = LatentDirichletAllocation(n_components=7, random_state=42)
LDA.fit(dtf)

# Top 10 words for a single topic
single_topic = LDA.components_[0]
top_ten_words = single_topic.argsort()[-10:]
for i in top_ten_words:
    print(cv.get_feature_names()[i])

# Top 15 words for every topic
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

# Assign the most probable topic to each article
topic_results = LDA.transform(dtf)
npr['Topic'] = topic_results.argmax(axis=1)
26
Q

How to use non-negative matrix factorization (NMF) for topic modelling?

A
import pandas as pd
npr = pd.read_csv('npr.csv')

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtf = tfidf.fit_transform(npr['Article'])

from sklearn.decomposition import NMF
nmf_model = NMF(n_components=7, random_state=42)
nmf_model.fit(dtf)

# Top 15 words for every topic
for index, topic in enumerate(nmf_model.components_):
    print(f'Top 15 words for topic # {index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

# Assign the highest-weighted topic to each article
topic_results = nmf_model.transform(dtf)
npr['Topic'] = topic_results.argmax(axis=1)
27
Q

Text generation using Keras

A
def read_file(filepath):
    with open(filepath) as f:
        file_text = f.read()
    return file_text

import spacy
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
nlp.max_length = 1198623   # raise the limit so the whole text can be processed at once

def separate_punct(doc_text):
    return [token.text.lower() for token in nlp(doc_text)
            if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

d = read_file('moby_dick_four_chapters.txt')
tokens = separate_punct(d)

# Build sequences of 25 tokens: 24 input words plus 1 target word
train_len = 24 + 1
text_sequences = []
for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)
vocabulary_size = len(tokenizer.word_counts)

import numpy as np
sequences = np.array(sequences)

from keras.utils import to_categorical

X = sequences[:, :-1]
y = sequences[:, -1]
y = to_categorical(y, num_classes=vocabulary_size + 1)
seq_len = X.shape[1]

from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

def create_model(vocab_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocab_size, seq_len, input_length=seq_len))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

model = create_model(vocabulary_size + 1, seq_len)

from pickle import dump, load
model.fit(X, y, batch_size=128, epochs=50, verbose=1)
model.save('mobydick_model.h5')
dump(tokenizer, open('mysimpletokenzr', 'wb'))

from keras.preprocessing.sequence import pad_sequences

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    # Final Output
    output_text = []

    # Initial Seed Sequence
    input_text = seed_text

    # Create num_gen_words
    for i in range(num_gen_words):
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Pad sequences to our trained rate (50 words in the video)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

import random
random.seed(101)
random_pick = random.randint(0, len(text_sequences))
random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)

generate_text(model, tokenizer, seq_len, seed_text, 25)
28
Q

Building a Q&A chatbot with Keras

A
import pickle
import numpy as np

with open('train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

with open('test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

all_data = train_data + test_data

# Build the vocabulary from every story, question and answer
vocab = set()
for story, question, answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))
vocab.add('no')
vocab.add('yes')
vocab_len = len(vocab) + 1

all_story_lens = [len(data[0]) for data in all_data]
max_story_len = max(all_story_lens)
max_question_len = max([len(data[1]) for data in all_data])

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

def vectorize_stories(data, word_index=tokenizer.word_index,
                      max_story_len=max_story_len, max_question_len=max_question_len):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=max_story_len),
            pad_sequences(Xq, maxlen=max_question_len),
            np.array(Y))

inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

vocab_size = len(vocab) + 1

# Encoders for the story (memory) and the question
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.3))

input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.3))

input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

response = add([match, input_encoded_c])
response = Permute((2, 1))(response)

answer = concatenate([response, question_encoded])
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)
answer = Activation('softmax')(answer)

model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit([inputs_train, queries_train], answers_train,
                    batch_size=32, epochs=100,
                    validation_data=([inputs_test, queries_test], answers_test))

pred_results = model.predict([inputs_test, queries_test])

# Ask a new question
my_story = 'John left the kitchen . Sandra dropped the football in the garden .'
my_question = 'Is the football in the garden ?'
mydata = [(my_story.split(), my_question.split(), 'yes')]
mystory, myques, myans = vectorize_stories(mydata)
pred_results = model.predict([mystory, myques])
val_max = np.argmax(pred_results[0])
for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key
k
29
Q

Building a corpus from individual files using pandas and Python's os module

A
import numpy as np
import pandas as pd
import os

row_list = []

for subdir in ['neg', 'pos']:
    for folder, subfolders, filenames in os.walk('../moviereviews/' + subdir):
        for file in filenames:
            d = {'label': subdir}   # assign the name of the subdirectory to the label field
            with open('moviereviews/' + subdir + '/' + file) as f:
                if f.read():        # handles the case of empty files, which become NaN on import
                    f.seek(0)
                    d['review'] = f.read()   # assign the contents of the file to the review field
            row_list.append(d)
        break   # only walk the top level of each subdirectory

df = pd.DataFrame(row_list)
30
Q

How to efficiently process lots of text data in Spacy?

A
docs = list(nlp.pipe(LOTS_OF_TEXTS))
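
Usage sketch (LOTS_OF_TEXTS stands for any iterable of strings; batch_size is optional):

texts = ['First document ...', 'Second document ...', 'Third document ...']

for doc in nlp.pipe(texts, batch_size=50):
    print([(ent.text, ent.label_) for ent in doc.ents])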
31
Q

Disabling pipeline components in Spacy

A
# Disable tagger and parser
with nlp.disable_pipes("tagger", "parser"):
    # Process the text and print the entities
    doc = nlp(text)
    print(doc.ents)
32
Q

Custom extensions for docs, tokens and spans in Spacy

A
from spacy.tokens import Token

# Define getter function
def get_is_color(token):
    colors = ["red", "yellow", "blue"]
    return token.text in colors

# Set extension on the Token with getter
Token.set_extension("is_color", getter=get_is_color)

doc = nlp("The sky is blue.")
print(doc[3]._.is_color, "-", doc[3].text)

from spacy.tokens import Span

# Define getter function
def get_has_color(span):
    colors = ["red", "yellow", "blue"]
    return any(token.text in colors for token in span)

# Set extension on the Span with getter
Span.set_extension("has_color", getter=get_has_color)

doc = nlp("The sky is blue.")
print(doc[1:4]._.has_color, "-", doc[1:4].text)
print(doc[0:2]._.has_color, "-", doc[0:2].text)

from spacy.tokens import Doc

# Define method with arguments
def has_token(doc, token_text):
    in_doc = token_text in [token.text for token in doc]
    return in_doc

# Set extension on the Doc with method
Doc.set_extension("has_token", method=has_token)

doc = nlp("The sky is blue.")
print(doc._.has_token("blue"), "- blue")
print(doc._.has_token("cloud"), "- cloud")
33
Q

How to update a statistical model in Spacy?

A
import random
import spacy

TRAINING_DATA = [
    ("How to preorder the iPhone X", {"entities": [(20, 28, "GADGET")]})
    # And many more examples...
]

# Loop for 10 iterations
for i in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    # Create batches and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA):
        # Split the batch in texts and annotations
        texts = [text for text, annotation in batch]
        annotations = [annotation for text, annotation in batch]
        # Update the model
        nlp.update(texts, annotations)

# Save the model
nlp.to_disk(path_to_model)