Shakespearean English Semantics
Analyzing patterns in some of the British author's finest works.
- Project At A Glance
- Dependencies
- Initialization
- Text Pre-Processing
- Custom Word2Vec and Word-Similarity
- Generate Embedding DataFrame
- Word-Embedding Visualization
- Principal Component Analysis
Project At A Glance
Objective
: Evaluate and visualize relationships between annotations of the era's literature.
Data
: Compiled Shakespeare Dataset [Download]
Implementation
: Word2Vec, Word Embeddings, Principal Component Analysis
Results
:
- 100-dimensional vector representation of every word that occurs at least three times in the corpus.
- Computed similarity scores for distinct words of the time period relative to today's English language.
- Scatter plots to visualize Word-Embeddings and their Principal Components.
Deployment
: View this project on GitHub.
import pandas as pd
import nltk
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.models import Word2Vec as w2v
from sklearn.decomposition import PCA
# Matplotlib theme applied to the figures drawn further down.
plt.style.use('ggplot')

# Location of the raw corpus, read line by line during pre-processing.
PATH = 'ShakespeareDataset.txt'

# NLTK's English stop-word list, used to filter tokens before training.
sw = stopwords.words('english')
%%time
# Build the punctuation-stripping table once instead of once per line.
punctuation_table = str.maketrans('', '', string.punctuation)

# NOTE(review): the encoding is pinned so the result does not depend on the
# platform default; UTF-8 is assumed for the dataset -- confirm.
with open(PATH, 'r', encoding='utf-8') as f:
    # Single pass per line: drop the trailing newline, lower-case the text,
    # then remove every character listed in string.punctuation.
    lines = [
        raw.rstrip('\n').lower().translate(punctuation_table)
        for raw in f
    ]
# Tokenize every cleaned line with NLTK's word_tokenize; `lines` is rebound
# to the resulting list (one entry per line). %time reports how long it took.
%time lines = [word_tokenize(line) for line in lines]
def _default_stopwords():
    """Return the module-level stop-word list ``sw``."""
    return sw


def remove_stopwords(lines, sw=None):
    """Remove stop words from each tokenized line.

    Args:
        lines: Iterable of token lists, one list per line of text.
        sw: Iterable of stop words to drop. When omitted, the module-level
            ``sw`` list is used, looked up at call time.

    Returns:
        A list with one token list per input line, in input order. If
        filtering would leave a line empty, the original (unfiltered) line
        is kept instead.
    """
    if sw is None:
        sw = _default_stopwords()

    # Hash-based membership: the stop list is scanned once here rather than
    # once for every token in the corpus.
    stop_words = frozenset(sw)

    res = []
    for line in lines:
        kept = [w for w in line if w not in stop_words]
        # Fall back to the untouched line so stop-word-only lines survive.
        res.append(kept or line)
    return res
# Drop stop words from every tokenized line; a line made up solely of stop
# words is kept unchanged. %time reports how long the call took.
%time filtered_lines = remove_stopwords(lines = lines, sw = sw)
%%time
# Train the Word2Vec model on the stop-word-filtered token lists.
w = w2v(
    sentences=filtered_lines,
    window=7,     # maximum distance between the target word and a context word
    min_count=3,  # ignore words that occur fewer than three times
    sg=1,         # 1 selects skip-gram training (0 would be CBOW)
)
# Nearest neighbours of a few period-typical words in the learned embedding
# space; each expression is evaluated for its notebook display.
# NOTE(review): a query word that fell below min_count would presumably
# raise KeyError -- confirm against the trained vocabulary.
w.wv.most_similar('thou')
w.wv.most_similar('shall')
w.wv.most_similar('abhor')
w.wv.most_similar('vile')
%%time
# Gather one embedding vector per vocabulary word, labelled by the word itself.
vocabulary = list(w.wv.key_to_index)
word_vectors = [w.wv.get_vector(str(token)) for token in vocabulary]
emb_df = pd.DataFrame(word_vectors, index=vocabulary)

# Notebook inspection: dimensions first, then the leading rows.
emb_df.shape
emb_df.head()
# Scatter the first two raw embedding dimensions against each other.
plt.clf()
fig = plt.figure(figsize=(6, 4))

first_dim = emb_df.iloc[:, 0]
second_dim = emb_df.iloc[:, 1]
plt.scatter(first_dim, second_dim, s=0.2, color='maroon', alpha=0.5)

plt.title('Embedding Visualizations')
plt.show()
# Reduce the embeddings to two principal components.
pca = PCA(random_state=7, n_components=2)

# Despite its name, `pca_model` holds the value returned by fit_transform
# (the projected data), not the fitted estimator.
pca_model = pca.fit_transform(emb_df)

# Re-attach the word labels so each projected point maps back to its word.
emb_df_PCA = pd.DataFrame(data=pca_model, index=emb_df.index, columns=['x', 'y'])
# Scatter the two principal components of the word embeddings.
plt.clf()
fig = plt.figure(figsize=(6, 4))

marker_style = {'s': 0.4, 'color': 'maroon', 'alpha': 0.5}
plt.scatter(emb_df_PCA['x'], emb_df_PCA['y'], **marker_style)

plt.title('PCA Visualization')
plt.xlabel('PCA-1')
plt.ylabel('PCA-2')

plt.plot()  # called with no data, so no extra line is drawn
plt.show()