# Get df, df_len, and stopwords from the preprocessing notebook
%run lda-short-text-preprocess.ipynb
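# The %run above is expected to define df, df_len, stopwords, and the jieba_cut
# tokenizer used throughout this notebook. jieba_cut itself is not shown here; a
# minimal fallback sketch of what it plausibly does (assumption: jieba tokenization
# plus stopword filtering; the real helper may differ):
if 'jieba_cut' not in globals():
    import jieba
    def jieba_cut(text, stopwords):
        # Tokenize with jieba, dropping stopwords and whitespace-only tokens
        return [tok for tok in jieba.cut(str(text)) if tok.strip() and tok not in stopwords]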
n_topics = 30
topic_name = [str(i+1) for i in range(n_topics)]
%store topic_name
from wide_and_deep.lda import get_tab, corpus2df
df.columns
df.groupby(['text_level'])['case_id_new'].count().head(10)
# df = df[df['text_level'].isin([3,4])]
# df = df.head(10000)
df.shape
# pd.Series.dropna?
# text = np.hstack(
#     (
#         ("firstPart " + df["answer_part1"]).dropna().values,
#         ("secondPart " + df["answer_part2"]).dropna().values,
#         ("thirdPart " + df["answer_part3"]).dropna().values,
#     )
# )
# Option 1 (unused): prefix each answer with its part ID
# text = df['answer_part'].astype(str)+'part '+df["answer"].fillna(' ')
# Option 2 (used): the plain answer text, without a part-ID prefix
text = df["answer"].fillna(' ')
%store df
# Consistency check for the hstack variant: total rows should equal the sum of the non-null parts
# (
#     text.shape[0]
#     == df["answer_part1"].dropna().values.shape[0]
#     + df["answer_part2"].dropna().values.shape[0]
#     + df["answer_part3"].dropna().values.shape[0]
# ), text.shape[0]
# corpus = pd.Series({'text':text}).apply(lambda x: jieba_cut(x, stopwords))
corpus = text.apply(lambda x: jieba_cut(x, stopwords))
# Optional alternative: learn {1,n}-gram collocations with gensim Phrases (kept for reference)
# from gensim.models import Phrases
# from gensim.models.phrases import Phraser
# bigram = Phrases(corpus, min_count=5, threshold=10, delimiter=b'-')
# trigram = Phrases(bigram[corpus], min_count=5, threshold=10, delimiter=b'-')
# quadgram = Phrases(trigram[corpus], min_count=5, threshold=10, delimiter=b'-')
# bigram_phraser = Phraser(bigram)
# trigram_phraser = Phraser(trigram)
# quadgram_phraser = Phraser(quadgram)
# corpus = [quadgram_phraser[trigram_phraser[bigram_phraser[sent]]] for sent in corpus]
from nltk.util import ngrams
# Replace each token list with its 5-grams, joined by '-'
corpus_new = []
for corpus_i in corpus:
    corpus_output_i = ["-".join(i) for i in ngrams(corpus_i, 5)]
    corpus_new.append(corpus_output_i)
corpus = corpus_new
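# The `{1,n}` note above suggests keeping every gram length from 1 up to n, rather
# than only 5-grams as done here. A sketch of that variant using nltk's everygrams
# (kept commented out: it is an assumption about the intent, not what actually ran):
# from nltk.util import everygrams
# corpus = [["-".join(g) for g in everygrams(tokens, 1, 5)] for tokens in corpus]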
# https://stackoverflow.com/a/5829377
from collections import Counter
# Sanity check: distribution of gram lengths across the corpus
Counter([len(j.split('-')) for i in corpus for j in i])
corpus[0:20]
# Corpus size and the length of the shortest document
len(corpus), np.min([len(i) for i in corpus if i is not None])
dictionary = corpora.Dictionary(corpus)
doc_term_matrix = [dictionary.doc2bow(rev) for rev in corpus]
lda = LdaModel(doc_term_matrix, id2word=dictionary, num_topics=n_topics)
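# Quick sanity check (illustrative): the inferred topic mixture of the first document.
print(lda.get_document_topics(doc_term_matrix[0]))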
%store lda
%store corpus
%store dictionary
%store doc_term_matrix
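# The stored variables can be restored in a companion notebook via IPython's
# storemagic, e.g. (illustrative):
# %store -r lda corpus dictionary doc_term_matrix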
tab = get_tab(lda)
tab.render_notebook()
from pprint import pprint
pprint(lda.print_topics(num_topics=n_topics))
# distance : {'kullback_leibler', 'hellinger', 'jaccard', 'jensen_shannon'}
distance = ['kullback_leibler', 'hellinger', 'jaccard', 'jensen_shannon']
for i in distance:
    # Pairwise topic-to-topic distances of the model against itself
    topic_over_topic, annotation = lda.diff(lda, distance=i, annotation=True)
    df_topic_over_topic = pd.DataFrame(topic_over_topic)
    df_topic_over_topic.columns = df_topic_over_topic.index = topic_name
    print(i)
    print(df_topic_over_topic.iloc[0:5, 0:5])
    file_name = 'data/df_topic_over_topic_' + i + '.csv'
    df_topic_over_topic.to_csv(file_name)
# Heatmap of the topic-over-topic distance matrix (the last distance computed above).
import matplotlib.pyplot as plt
import seaborn as sns
ax = sns.heatmap(df_topic_over_topic)
plt.yticks(np.arange(0.5, len(df_topic_over_topic.index), 1), df_topic_over_topic.index)
plt.xticks(np.arange(0.5, len(df_topic_over_topic.columns), 1), df_topic_over_topic.columns)
# rotate the axis labels
for item in ax.get_yticklabels():
    item.set_rotation(0)
for item in ax.get_xticklabels():
    item.set_rotation(90)
# save the figure before plt.show() clears the canvas
plt.savefig('output/heatmap_topic_over_topic.png', dpi=100)
plt.show()
# Save the trained LDA model
with open("model/lda_model_sent_segmentation.pkl", 'wb') as fp:
    pkl.dump(lda, fp)
%store n_topics
%run lda-short-text-w2v.ipynb
# Reload the saved LDA model
with open("model/lda_model_sent_segmentation.pkl", 'rb') as fp:
    lda = pkl.load(fp)
df.shape
df.columns
def get_doc_term_matrix(some_corpus):
    # Build bag-of-words vectors; wrap an empty document in a list so that
    # doc2bow still receives an iterable of tokens.
    some_list = []
    for rev in some_corpus:
        if rev == '':
            some_list.append(dictionary.doc2bow([rev]))
        else:
            some_list.append(dictionary.doc2bow(rev))
    return some_list
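# Note that doc2bow silently ignores tokens absent from `dictionary`, so unseen
# vocabulary contributes nothing to topic inference. Illustrative:
print(dictionary.doc2bow(['token-not-in-the-dictionary']))  # -> []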
# dictionary = corpora.Dictionary(corpus)
# export
def get_doc_topic_output(some_column):
    # Tokenize a text column, convert it to bag-of-words, and infer per-document topics
    some_corpus = df[some_column].apply(lambda x: jieba_cut(x, stopwords))
    doc_term_matrix = get_doc_term_matrix(some_corpus)
    corpus_output = lda.get_document_topics(doc_term_matrix)
    return corpus_output
# corpus_output_part1 = get_doc_topic_output('answer_part1')
# corpus_output_part2 = get_doc_topic_output('answer_part2')
# corpus_output_part3 = get_doc_topic_output('answer_part3')
corpus_output = get_doc_topic_output('answer')
%store corpus_output
def get_topic_df(output):
    # corpus2csc infers the matrix height from the highest topic id it sees, so pass
    # num_terms=n_topics explicitly to guarantee one column per topic.
    all_topics_numpy = gensim.matutils.corpus2csc(output, num_terms=n_topics).T.toarray()
    all_topics_df = pd.DataFrame(all_topics_numpy, columns=topic_name).reset_index(drop=True)
    return all_topics_df
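# Illustrative shape check (assumes the num_terms fix above): one column per topic.
# assert get_topic_df(corpus_output).shape[1] == n_topics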
# subset_index = [i is not None and len(i)>0 for i in corpus]
# some_corpus = corpus[subset_index]
# some_corpus[0:10]
# df_part1 = get_topic_df(corpus_output_part1)
# df_part2 = get_topic_df(corpus_output_part2)
# df_part3 = get_topic_df(corpus_output_part3)
# subset
# df_part = get_topic_df(some_corpus[0:1])
# all
df_part = get_topic_df(corpus_output)
# output_col_name = (
#     df.columns.tolist()
#     + ["answer_part1" + "-topic-" + i for i in df_part1.columns.tolist()]
#     + ["answer_part2" + "-topic-" + i for i in df_part2.columns.tolist()]
#     + ["answer_part3" + "-topic-" + i for i in df_part3.columns.tolist()]
# )
output_col_name = (
    df.columns.tolist()
    + ["answer_part" + "-topic-" + i for i in df_part.columns.tolist()]
)
# for i in [df, df_part1, df_part2, df_part3]:
#     print(i.shape)
for i in [df, df_part]:
    print(i.shape)
df.shape,df_part.shape
# df_output = pd.concat([df,df_part1,df_part2,df_part3], axis = 1)
df_output = pd.concat([df.reset_index(drop = True),df_part], axis = 1)
df_output.columns = output_col_name
df_output.shape
df_output.to_csv("data/document_with_topic_with_short_text.csv", index=False, encoding = "UTF-8")
df_output.head()
! jupyter nbconvert --to html --output-dir output lda-short-text.ipynb