-
Book Overview & Buying
-
Table Of Contents
-
Feedback & Rating

Data Science with Python
By :

Solution:
import pandas as pd
# Load the movie-review dataset (the CSV is decoded as latin-1).
data = pd.read_csv('../../chapter 7/data/movie_reviews.csv', encoding='latin-1')
# Lower-case the review text. Every later step of this solution reads the
# reviews from the `SentimentText` column, so that is the column normalised
# here (the original lower-cased `data.text`, which no later step uses).
data['SentimentText'] = data['SentimentText'].str.lower()
Keep in mind that "Hello" and "hello" are not the same to a computer.
import re
# Ordered (pattern, replacement) pairs applied by clean_str. Order matters:
# URLs are removed before the punctuation pass would break them apart, and the
# contraction rewrites run last (the apostrophe is not in the punctuation set).
_REVIEW_SUBSTITUTIONS = (
    (r"https?\://\S+", ''),
    (r'\<a href', ' '),
    (r'&', ''),
    (r'<br />', ' '),
    (r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' '),
    (r'\d', ''),  # raw string: '\d' in a plain literal is an invalid escape
    (r"can\'t", "cannot"),
    (r"it\'s", "it is"),
)


def clean_str(string):
    """Return *string* with URLs, HTML remnants, punctuation and digits removed.

    The substitutions in ``_REVIEW_SUBSTITUTIONS`` are applied in order:
    URLs, ``&`` and digits are deleted; ``<a href``, ``<br />`` and the listed
    punctuation characters become a single space; ``can't`` and ``it's`` are
    expanded to ``cannot`` and ``it is``.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text. Whitespace is not collapsed, so runs of spaces may
        remain where characters were replaced.
    """
    for pattern, replacement in _REVIEW_SUBSTITUTIONS:
        string = re.sub(pattern, replacement, string)
    return string
# Clean every review; each cell is converted with str() before clean_str sees it.
data['SentimentText'] = data['SentimentText'].map(lambda review: clean_str(str(review)))
To see how we found these words, refer to Exercise 51.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
# NLTK's English stop words plus three extra words, kept as a set for lookups.
stop_words = set(stopwords.words('english') + ['movie', 'film', 'time'])


def remove_stop_words(r):
    """Split *r* into sentences and return each one as a list of non-stop-word tokens."""
    filtered_sentences = []
    for sente in sent_tokenize(r):
        kept = [word for word in word_tokenize(sente) if word not in stop_words]
        filtered_sentences.append(kept)
    return filtered_sentences


# Each review becomes a list of token lists, one per sentence.
data['SentimentText'] = data['SentimentText'].apply(remove_stop_words)
from gensim.models import Word2Vec
# Train on the first tokenised sentence of each review. Reviews that yielded no
# sentences are skipped so that an empty review cannot raise IndexError here.
first_sentences = [sentences[0] for sentences in data['SentimentText'] if sentences]
# NOTE(review): `iter` and `size` are the gensim < 4.0 keyword names (4.0 renamed
# them to `epochs` and `vector_size`) — confirm against the installed version.
model = Word2Vec(
    first_sentences,
    iter=10,
    size=16,
    window=5,
    min_count=5,
    workers=10)
# Write the learned vectors out in text (non-binary) word2vec format.
model.wv.save_word2vec_format('movie_embedding.txt', binary=False)
def combine_text(text):
    """Join the tokens of the first sentence of *text* into one string.

    Args:
        text: A sequence of sentences, each a sequence of string tokens.

    Returns:
        The first sentence's tokens joined by single spaces, or NaN when
        there is no first sentence or it cannot be joined (so the row can be
        removed with ``dropna``).
    """
    try:
        return ' '.join(text[0])
    except (LookupError, TypeError):
        # float("nan") is the same value as numpy's nan but does not depend
        # on numpy having been imported before this function is called.
        return float("nan")
# Replace each review by the text combine_text builds from it, then drop every
# row that has a missing value in any column.
data['SentimentText'] = data['SentimentText'].apply(combine_text)
data = data.dropna(how='any')
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Fit a tokenizer limited to 5000 words on the review texts.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(list(data['SentimentText']))
word_index = tokenizer.word_index

# Encode each review as a sequence of word ids, padded to length 100.
sequences = tokenizer.texts_to_sequences(data['SentimentText'])
reviews = pad_sequences(sequences, maxlen=100)
import numpy as np
def load_embedding(filename, word_index , num_words, embedding_dim):
    """Build an embedding matrix from a text-format embedding file.

    Each usable line of *filename* has the form
    ``word v1 v2 ... v<embedding_dim>``. Row ``word_index[word]`` of the
    result receives that word's vector; rows for words that do not appear in
    the file stay all-zero.

    Args:
        filename: Path of the UTF-8 encoded embedding file.
        word_index: Mapping of word -> row position in the matrix.
        num_words: Number of rows; words whose position is ``>= num_words``
            are skipped.
        embedding_dim: Number of values expected in each vector.

    Returns:
        A ``numpy.ndarray`` of shape ``(num_words, embedding_dim)``.
    """
    embedding_matrix = np.zeros((num_words, embedding_dim))
    # `with` closes the file even if reading or parsing raises.
    with open(filename, encoding="utf-8") as handle:
        for line in handle:
            values = line.split()
            # Skip blank lines and lines that are not "word + embedding_dim
            # values", e.g. the "count dim" header of a word2vec text file.
            # A shorter vector would otherwise be broadcast across the row.
            if len(values) != embedding_dim + 1:
                continue
            pos = word_index.get(values[0])
            if pos is None or pos >= num_words:
                continue
            try:
                embedding_matrix[pos] = np.asarray(values[1:], dtype=float)
            except ValueError:
                # A value that is not a number: leave the row as zeros.
                continue
    return embedding_matrix
from sklearn.model_selection import train_test_split

# Embedding matrix built from the vectors saved in movie_embedding.txt.
embedding_matrix = load_embedding(
    'movie_embedding.txt',
    word_index,
    num_words=len(word_index),
    embedding_dim=16,
)

# One indicator column per distinct value of the Sentiment column.
labels = pd.get_dummies(data['Sentiment'])

# Hold out 20% of the rows; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=9,
)
from keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten
from keras.models import Model
# Input of length 100, matching the maxlen used when padding the reviews.
inp = Input((100,))
# Embedding initialised from embedding_matrix and frozen (trainable=False).
embedding_layer = Embedding(
    len(word_index),
    16,
    weights=[embedding_matrix],
    input_length=100,
    trainable=False,
)(inp)

# Fully connected classifier on top of the flattened embeddings.
hidden = Flatten()(embedding_layer)
hidden = BatchNormalization()(hidden)
hidden = Dropout(0.10)(hidden)
hidden = Dense(units=1024, activation='relu')(hidden)
hidden = Dense(units=256, activation='relu')(hidden)
hidden = Dropout(0.5)(hidden)
predictions = Dense(units=2, activation='softmax')(hidden)

model = Model(inputs=inp, outputs=predictions)
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['acc'])
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=256,
)
from sklearn.metrics import accuracy_score

# Class scores for the held-out reviews.
preds = model.predict(X_test)
# Compare the highest-scoring class per row with the one-hot label's class.
accuracy_score(np.argmax(preds, axis=1), np.argmax(y_test.values, axis=1))
The accuracy of the model is:
# Actual vs. predicted class indices, cross-tabulated with row/column totals.
y_actual = pd.Series(np.argmax(y_test.values, 1), name='Actual')
y_pred = pd.Series(np.argmax(preds, 1), name='Predicted')
pd.crosstab(index=y_actual, columns=y_pred, margins=True)
Check the following
# Show one held-out review with its predicted and actual sentiment.
review_num = 111
print("Review: \n"+tokenizer.sequences_to_texts([X_test[review_num]])[0])

# A non-zero argmax (class index 1) is reported as positive.
sentiment = "Negative" if not np.argmax(preds[review_num]) else "Positive"
print("\nPredicted sentiment = "+ sentiment)

sentiment = "Negative" if not np.argmax(y_test.values[review_num]) else "Positive"
print("\nActual sentiment = "+ sentiment)
Check that you receive the following output:
Solution:
import pandas as pd
# The CSV has no header row, so the column names are assigned afterwards.
data = pd.read_csv('tweet-data.csv', encoding='latin-1', header=None)
data.columns = ['sentiment', 'id', 'date', 'q', 'user', 'text']
# Keep only the sentiment and text columns.
data = data.drop(columns=['id', 'date', 'q', 'user'])
# Work on a random sample of 400000 rows, renumbered from 0.
data = data.sample(400000).reset_index(drop=True)
data['text'] = data['text'].str.lower()
import re
# Ordered (pattern, replacement) pairs applied by clean_str. Order matters:
# URLs and @mentions are removed before the punctuation pass would break them
# apart.
_TWEET_SUBSTITUTIONS = (
    (r"https?\://\S+", ''),
    # A mention followed by whitespace OR ending the string; the original
    # pattern required trailing whitespace and so kept a final "@user".
    (r"@\w*(?:\s|$)", ''),
    (r'\<a href', ' '),
    (r'&', ''),
    (r'<br />', ' '),
    (r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' '),
    (r'\d', ''),  # raw string: '\d' in a plain literal is an invalid escape
)


def clean_str(string):
    """Return *string* with URLs, @mentions, HTML remnants, punctuation and digits removed.

    The substitutions in ``_TWEET_SUBSTITUTIONS`` are applied in order: URLs,
    mentions, ``&`` and digits are deleted; ``<a href``, ``<br />`` and the
    listed punctuation characters become a single space.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text. Whitespace is not collapsed, so runs of spaces may
        remain where characters were replaced.
    """
    for pattern, replacement in _TWEET_SUBSTITUTIONS:
        string = re.sub(pattern, replacement, string)
    return string
# Clean every tweet; each cell is converted with str() before clean_str sees it.
data['text'] = data['text'].map(lambda tweet: clean_str(str(tweet)))
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
# NLTK's English stop words, kept as a set for membership tests.
stop_words = set(stopwords.words('english'))


def remove_stop_words(r):
    """Split *r* into sentences and return each one as a list of non-stop-word tokens."""
    filtered_sentences = []
    for sente in sent_tokenize(r):
        kept = [word for word in word_tokenize(sente) if word not in stop_words]
        filtered_sentences.append(kept)
    return filtered_sentences


# Each tweet becomes a list of token lists, one per sentence.
data['text'] = data['text'].apply(remove_stop_words)
def combine_text(text):
    """Join the tokens of the first sentence of *text* into one string.

    Args:
        text: A sequence of sentences, each a sequence of string tokens.

    Returns:
        The first sentence's tokens joined by single spaces, or NaN when
        there is no first sentence or it cannot be joined (so the row can be
        removed with ``dropna``).
    """
    try:
        return ' '.join(text[0])
    except (LookupError, TypeError):
        # float("nan") is the same value as numpy's nan but does not depend
        # on numpy having been imported before this function is called.
        return float("nan")
# Replace each tweet by the text combine_text builds from it, then drop every
# row that has a missing value in any column.
data['text'] = data['text'].apply(combine_text)
data = data.dropna(how='any')
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Fit a tokenizer limited to 5000 words on the tweet texts.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(list(data['text']))
word_index = tokenizer.word_index

# Encode each tweet as a sequence of word ids, padded to length 50.
sequences = tokenizer.texts_to_sequences(data['text'])
tweets = pad_sequences(sequences, maxlen=50)
import numpy as np
def load_embedding(filename, word_index , num_words, embedding_dim):
    """Build an embedding matrix from a text-format embedding file.

    Each usable line of *filename* has the form
    ``word v1 v2 ... v<embedding_dim>``. Row ``word_index[word]`` of the
    result receives that word's vector; rows for words that do not appear in
    the file stay all-zero.

    Args:
        filename: Path of the UTF-8 encoded embedding file.
        word_index: Mapping of word -> row position in the matrix.
        num_words: Number of rows; words whose position is ``>= num_words``
            are skipped.
        embedding_dim: Number of values expected in each vector.

    Returns:
        A ``numpy.ndarray`` of shape ``(num_words, embedding_dim)``.
    """
    embedding_matrix = np.zeros((num_words, embedding_dim))
    # Vectors are written straight into the matrix, so only the rows that are
    # needed are kept in memory; `with` closes the file even on errors.
    with open(filename, encoding="utf-8") as handle:
        for line in handle:
            values = line.split()
            # Skip blank lines and lines that are not "word + embedding_dim
            # values"; a shorter vector would otherwise be broadcast across
            # the row and a longer one would raise.
            if len(values) != embedding_dim + 1:
                continue
            pos = word_index.get(values[0])
            if pos is None or pos >= num_words:
                continue
            try:
                embedding_matrix[pos] = np.asarray(values[1:], dtype=float)
            except ValueError:
                # A value that is not a number: leave the row as zeros.
                continue
    return embedding_matrix
from sklearn.model_selection import train_test_split

# Embedding matrix built from the 50-dimensional vectors in the GloVe file.
embedding_matrix = load_embedding(
    '../../embedding/glove.twitter.27B.50d.txt',
    word_index,
    num_words=len(word_index),
    embedding_dim=50,
)

# Hold out 20% of the rows; labels are one indicator column per sentiment
# value, and the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    pd.get_dummies(data['sentiment']),
    test_size=0.2,
    random_state=9,
)
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Embedding, Flatten, LSTM
# Embedding initialised from embedding_matrix and frozen (trainable=False).
embedding_layer = Embedding(
    len(word_index),
    50,
    weights=[embedding_matrix],
    input_length=50,
    trainable=False,
)

# Embedding -> dropout -> LSTM -> two-way softmax.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2),
    Dense(2, activation='softmax'),
])
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['acc'])
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=256,
)
# Class scores for the held-out tweets.
preds = model.predict(X_test)

# Show one held-out tweet with its predicted and actual sentiment.
review_num = 1
print("Tweet: \n"+tokenizer.sequences_to_texts([X_test[review_num]])[0])

# A non-zero argmax (class index 1) is reported as positive.
sentiment = "Negative" if not np.argmax(preds[review_num]) else "Positive"
print("\nPredicted sentiment = "+ sentiment)

sentiment = "Negative" if not np.argmax(y_test.values[review_num]) else "Positive"
print("\nActual sentiment = "+ sentiment)
The output is as follows:
Change the font size
Change margin width
Change background colour