NLP Flashcards

1
Q

How to create a Doc object in Spacy?

A

import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'This is some text')

2
Q

What is a span in Spacy?

A

A Span is a slice of a Doc object: doc[start:end]
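
For example (a minimal sketch; assumes the en_core_web_sm model is installed):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'This is some text about natural language processing')
span = doc[3:6]          # a Span covering tokens 3, 4 and 5
print(span.text)         # 'text about natural'
print(type(span))        # <class 'spacy.tokens.span.Span'>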

3
Q

What are noun_chunks in Spacy?

A

Base noun phrases: flat noun phrases with a noun as their head, exposed by the doc.noun_chunks generator
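
A quick sketch (assumes nlp is a loaded English pipeline; the labels in the comments are what en_core_web_sm typically produces for this sentence):

doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')

for chunk in doc.noun_chunks:
    print(chunk.text, '|', chunk.root.text, '|', chunk.root.dep_)

# Autonomous cars | cars | nsubj
# insurance liability | liability | dobj
# manufacturers | manufacturers | pobj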

4
Q

How to visualize in Spacy?

A

from spacy import displacy

# In a Jupyter notebook (style='dep' for dependencies, style='ent' for entities)
displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})

# Outside a notebook: serve the visualization and open 127.0.0.1:<port> in a browser
displacy.serve(doc, style='dep')

5
Q

How to get a list of stopwords in Spacy?

A

import spacy
nlp = spacy.load('en_core_web_sm')
print(nlp.Defaults.stop_words)

6
Q

How to check if a word is a stop word in Spacy?

A

nlp.vocab['word'].is_stop

7
Q

How to add a stop word in Spacy?

A

nlp.Defaults.stop_words.add('btw')

nlp.vocab['btw'].is_stop = True

8
Q

How to remove a stop word in Spacy?

A

nlp.Defaults.stop_words.remove('btw')

nlp.vocab['btw'].is_stop = False

9
Q

How to build a library of token patterns in Spacy?

A

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]
matcher.add('SolarPower', None, pattern1, pattern2)
found_matches = matcher(doc)
print(found_matches)

10
Q

How to use a matcher for terminology lists in Spacy?

A

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']
phrase_patterns = [nlp(text) for text in phrase_list]
matcher.add('VoodooEconomics', None, *phrase_patterns)
matches = matcher(doc)

11
Q

How to count POS frequency in a text in Spacy?

A

POS_counts = doc.count_by(spacy.attrs.POS)
for k, v in sorted(POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')

12
Q

How to add a named entity in Spacy?

A

from spacy.tokens import Span
ORG = doc.vocab.strings[u'ORG']
new_ent = Span(doc, start, end, label=ORG)
doc.ents = list(doc.ents) + [new_ent]
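
Here start and end are token positions in the Doc. A usage sketch (the sentence and indices are invented; assigning doc.ents fails if the new span overlaps an entity the model already found):

doc = nlp(u'Tesla to build a U.K. factory for $6 million')

ORG = doc.vocab.strings[u'ORG']
new_ent = Span(doc, 0, 1, label=ORG)   # token 0 ("Tesla") labelled as ORG
doc.ents = list(doc.ents) + [new_ent]
print([(ent.text, ent.label_) for ent in doc.ents])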

13
Q

How to add named entities to all matching spans in Spacy?

A

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]
matcher.add('newproduct', None, *phrase_patterns)
matches = matcher(doc)
from spacy.tokens import Span
PROD = doc.vocab.strings[u'PRODUCT']
new_ents = [Span(doc, match[1],match[2],label=PROD) for match in matches]
doc.ents = list(doc.ents) + new_ents

14
Q

How to add a new rule to the pipeline in Spacy?

A
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe(set_custom_boundaries, before='parser')

15
Q

How to change segmentation rules in Spacy?

A

from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'): # handles multiple occurrences
            seen_newline = True
    yield doc[start:]      # handles the last group of tokens

sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)

16
Q

TF-IDF

A

term frequency × inverse document frequency: tf-idf(t, d) = tf(t, d) × idf(t), with idf(t) = log(N / df(t)), where N is the number of documents and df(t) is the number of documents containing term t
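
A minimal sketch of computing TF-IDF weights with scikit-learn (the corpus is invented; get_feature_names_out assumes scikit-learn >= 1.0, older versions use get_feature_names):

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['the quick brown fox',
          'the lazy dog',
          'the quick dog jumps over the lazy fox']

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)         # sparse matrix: documents x terms
print(tfidf.get_feature_names_out())    # learned vocabulary
print(X.toarray().round(2))             # TF-IDF weight of each term in each document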

17
Q

How to extract text features using scikit-learn?

A

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train)

18
Q

How to make a prediction for a new text using classification model in scikit-learn?

A

text_clf.predict(['some text here'])
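
This assumes text_clf is an already-fitted classifier. A sketch of one way such a pipeline might be built (LinearSVC is an arbitrary choice; X_train and y_train stand for your training texts and labels):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC())])

text_clf.fit(X_train, y_train)                 # X_train: iterable of raw strings, y_train: labels
print(text_clf.predict(['some text here']))    # returns the predicted label(s)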

19
Q

How to remove missing data from a dataset (with pandas)?

A

df.dropna(inplace=True)
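
A quick check before and after (a sketch; df is whatever DataFrame you are cleaning):

print(df.isnull().sum())    # missing values per column
df.dropna(inplace=True)
print(df.isnull().sum())    # should now be all zeros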

20
Q

How to remove empty (whitespace-only) strings from a dataset (with pandas)?

A

blanks = []

for i, lb, rv in df.itertuples():   # (index, label, review) for each row
    if rv.isspace():                # the review is only whitespace
        blanks.append(i)
df.drop(blanks,inplace=True)

21
Q

How does Word2vec train words against words in a corpus?

A
  • Using context to predict a target word (continuous bag of words)
  • Using a word to predict a target context (skip-gram)
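
A minimal sketch of training both variants with gensim (assumes gensim 4.x, where vector_size replaced the older size argument; the toy sentences are invented):

from gensim.models import Word2Vec

sentences = [['the', 'quick', 'brown', 'fox'],
             ['the', 'lazy', 'dog'],
             ['the', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']]

cbow = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=0)       # sg=0 -> CBOW
skipgram = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=1)   # sg=1 -> skip-gram

print(skipgram.wv.most_similar('fox', topn=3))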
22
Q

check word similarity in Spacy

A

# Word similarity requires a model with word vectors, e.g. en_core_web_md or en_core_web_lg
tokens = nlp(u'fox dog animal')

for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

23
Q

vector arithmetic in Spacy (finding similar words using vectors)

A

from scipy import spatial

cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
# Now we find the closest vectors in the vocabulary to the result of "king" - "man" + "woman"
new_vector = king - man + woman
computed_similarities = []

for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word, similarity))

computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])

print([w[0].text for w in computed_similarities[:10]])

24
Q

How to do sentiment analysis using NLTK?

A

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

import pandas as pd
df = pd.read_csv('../TextFiles/amazonreviews.tsv', sep='\t')
df.head()

df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound'] = df['scores'].apply(lambda d: d['compound'])
df['score'] = df['compound'].apply(lambda s: 'pos' if s >= 0 else 'neg')

25
Q

How to use Latent Dirichlet Allocation for topic modelling?

A
import pandas as pd
npr = pd.read_csv('npr.csv')

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtf = cv.fit_transform(npr['Article'])

from sklearn.decomposition import LatentDirichletAllocation
LDA = LatentDirichletAllocation(n_components=7, random_state=42)
LDA.fit(dtf)

# Top 10 words for a single topic
single_topic = LDA.components_[0]
top_ten_words = single_topic.argsort()[-10:]
for i in top_ten_words:
    print(cv.get_feature_names()[i])

# Top 15 words for every topic
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

# Assign the most probable topic to each article
topic_results = LDA.transform(dtf)
npr['Topic'] = topic_results.argmax(axis=1)
26
Q

How to use non-negative matrix factorization (NMF) for topic modelling?

A
import pandas as pd
npr = pd.read_csv('npr.csv')

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtf = tfidf.fit_transform(npr['Article'])

from sklearn.decomposition import NMF
nmf_model = NMF(n_components=7, random_state=42)
nmf_model.fit(dtf)

# Top 15 words for every topic
for index, topic in enumerate(nmf_model.components_):
    print(f'Top 15 words for topic # {index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

# Assign the highest-weighted topic to each article
topic_results = nmf_model.transform(dtf)
npr['Topic'] = topic_results.argmax(axis=1)
27
Q

Text generation using Keras

A
def read_file(filepath):
    with open(filepath) as f:
        file_text = f.read()
    return file_text

import spacy
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
nlp.max_length = 1198623   # raise the limit so the whole text can be processed at once

def separate_punct(doc_text):
    return [token.text.lower() for token in nlp(doc_text)
            if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

d = read_file('moby_dick_four_chapters.txt')
tokens = separate_punct(d)

# Build sequences of 25 tokens: 24 input words plus 1 target word
train_len = 24 + 1
text_sequences = []
for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)
vocabulary_size = len(tokenizer.word_counts)

import numpy as np
sequences = np.array(sequences)

from keras.utils import to_categorical

X = sequences[:, :-1]
y = sequences[:, -1]
y = to_categorical(y, num_classes=vocabulary_size + 1)
seq_len = X.shape[1]

from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

def create_model(vocab_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocab_size, seq_len, input_length=seq_len))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

model = create_model(vocabulary_size + 1, seq_len)

from pickle import dump, load
model.fit(X, y, batch_size=128, epochs=50, verbose=1)
model.save('mobydick_model.h5')
dump(tokenizer, open('mysimpletokenzr', 'wb'))

from keras.preprocessing.sequence import pad_sequences

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    # Final Output
    output_text = []

    # Initial Seed Sequence
    input_text = seed_text

    # Create num_gen_words
    for i in range(num_gen_words):
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Pad sequences to our trained rate (50 words in the video)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

import random
random.seed(101)
random_pick = random.randint(0, len(text_sequences))
random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)

generate_text(model, tokenizer, seq_len, seed_text, 25)
28
Q

Building a Q&A chatbot with Keras

A
import pickle
import numpy as np

with open('train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

with open('test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

all_data = train_data + test_data

# Build the vocabulary from every story, question and answer
vocab = set()
for story, question, answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))
vocab.add('no')
vocab.add('yes')
vocab_len = len(vocab) + 1

all_story_lens = [len(data[0]) for data in all_data]
max_story_len = max(all_story_lens)
max_question_len = max([len(data[1]) for data in all_data])

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

def vectorize_stories(data, word_index=tokenizer.word_index,
                      max_story_len=max_story_len, max_question_len=max_question_len):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=max_story_len),
            pad_sequences(Xq, maxlen=max_question_len),
            np.array(Y))

inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

vocab_size = len(vocab) + 1

# Encoders for the story (memory) and the question
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.3))

input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.3))

input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

response = add([match, input_encoded_c])
response = Permute((2, 1))(response)

answer = concatenate([response, question_encoded])
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)
answer = Activation('softmax')(answer)

model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit([inputs_train, queries_train], answers_train,
                    batch_size=32, epochs=100,
                    validation_data=([inputs_test, queries_test], answers_test))

pred_results = model.predict([inputs_test, queries_test])

# Ask a new question
my_story = 'John left the kitchen . Sandra dropped the football in the garden .'
my_question = 'Is the football in the garden ?'
mydata = [(my_story.split(), my_question.split(), 'yes')]
mystory, myques, myans = vectorize_stories(mydata)
pred_results = model.predict([mystory, myques])
val_max = np.argmax(pred_results[0])
for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key
k
29
Q

Building a corpus from individual files using pandas and Python's os module

A
import numpy as np
import pandas as pd
import os

row_list = []

for subdir in ['neg', 'pos']:
    for folder, subfolders, filenames in os.walk('../moviereviews/' + subdir):
        for file in filenames:
            d = {'label': subdir}   # assign the name of the subdirectory to the label field
            with open('moviereviews/' + subdir + '/' + file) as f:
                if f.read():        # handles the case of empty files, which become NaN on import
                    f.seek(0)
                    d['review'] = f.read()   # assign the contents of the file to the review field
            row_list.append(d)
        break   # only walk the top level of each subdirectory

df = pd.DataFrame(row_list)
30
Q

How to efficiently process lots of text data in Spacy?

A
docs = list(nlp.pipe(LOTS_OF_TEXTS))
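
Usage sketch (LOTS_OF_TEXTS stands for any iterable of strings; batch_size is optional):

texts = ['First document ...', 'Second document ...', 'Third document ...']

for doc in nlp.pipe(texts, batch_size=50):
    print([(ent.text, ent.label_) for ent in doc.ents])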
31
Q

Disabling pipeline components in Spacy

A
# Disable tagger and parser
with nlp.disable_pipes("tagger", "parser"):
    # Process the text and print the entities
    doc = nlp(text)
    print(doc.ents)
32
Q

Custom extensions for docs, tokens and spans in Spacy

A
from spacy.tokens import Token

# Define getter function
def get_is_color(token):
    colors = ["red", "yellow", "blue"]
    return token.text in colors

# Set extension on the Token with getter
Token.set_extension("is_color", getter=get_is_color)

doc = nlp("The sky is blue.")
print(doc[3]._.is_color, "-", doc[3].text)

from spacy.tokens import Span

# Define getter function
def get_has_color(span):
    colors = ["red", "yellow", "blue"]
    return any(token.text in colors for token in span)

# Set extension on the Span with getter
Span.set_extension("has_color", getter=get_has_color)

doc = nlp("The sky is blue.")
print(doc[1:4]._.has_color, "-", doc[1:4].text)
print(doc[0:2]._.has_color, "-", doc[0:2].text)

from spacy.tokens import Doc

# Define method with arguments
def has_token(doc, token_text):
    in_doc = token_text in [token.text for token in doc]
    return in_doc

# Set extension on the Doc with method
Doc.set_extension("has_token", method=has_token)

doc = nlp("The sky is blue.")
print(doc._.has_token("blue"), "- blue")
print(doc._.has_token("cloud"), "- cloud")
33
Q

How to update a statistical model in Spacy?

A
import random
import spacy

TRAINING_DATA = [
    ("How to preorder the iPhone X", {"entities": [(20, 28, "GADGET")]})
    # And many more examples...
]

# Loop for 10 iterations
for i in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    # Create batches and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA):
        # Split the batch in texts and annotations
        texts = [text for text, annotation in batch]
        annotations = [annotation for text, annotation in batch]
        # Update the model
        nlp.update(texts, annotations)

# Save the model
nlp.to_disk(path_to_model)