In [3]:
# load df, stopwords, and the jieba_cut helper from the preprocessing notebook
%run lda-short-text-preprocess.ipynb
d:\install\miniconda\lib\site-packages\statsmodels\tools\_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic
In [4]:
n_topics = 30
topic_name = [str(i+1) for i in range(n_topics)]
In [56]:
%store topic_name
Stored 'topic_name' (list)
In [5]:
from wide_and_deep.lda import get_tab, corpus2df
In [6]:
df.columns
Out[6]:
Index(['case_id_new', 'text_level', 'calluuid', 'answer', 'answer_part'], dtype='object')
In [7]:
df.groupby(['text_level'])['case_id_new'].count().head(10)
Out[7]:
text_level
3     422547
4     386142
5     326280
6     263495
7     210010
8     167988
9     134769
10    109548
11     89557
12     74026
Name: case_id_new, dtype: int64
In [8]:
# optional subsets for quick iteration:
# df = df[df['text_level'].isin([3,4])]
# df = df.head(10000)
In [9]:
df.shape
Out[9]:
(2668243, 5)
In [10]:
# pd.Series.dropna?
In [11]:
# text = np.hstack(
#     (
#         ("firstPart " + df["answer_part1"]).dropna().values,
#         ("secondPart " + df["answer_part2"]).dropna().values,
#         ("thirdPart " + df["answer_part3"]).dropna().values,
#     )
# )
# variant that prefixes each answer with its part id:
# text = df['answer_part'].astype(str)+'part '+df["answer"].fillna(' ')
# variant without the part-id prefix (used here):
text = df["answer"].fillna(' ')
In [55]:
%store df
Stored 'df' (DataFrame)
In [12]:
# (
#     text.shape[0]
#     == df["answer_part1"].dropna().values.shape[0]
#     + df["answer_part2"].dropna().values.shape[0]
#     + df["answer_part3"].dropna().values.shape[0]
# ),text.shape[0]
In [60]:
# corpus = pd.Series({'text':text}).apply(lambda x: jieba_cut(x, stopwords))
corpus = text.apply(lambda x: jieba_cut(x, stopwords))
In [61]:
# alternative: learn {1,n}-gram phrases with gensim's Phrases/Phraser
# from gensim.models import Phrases
# from gensim.models.phrases import Phraser

# bigram = Phrases(corpus, min_count=5, threshold=10, delimiter=b'-')
# trigram = Phrases(bigram[corpus], min_count=5, threshold=10, delimiter=b'-')
# quadgram = Phrases(trigram[corpus], min_count=5, threshold=10, delimiter=b'-')

# bigram_phraser = Phraser(bigram)
# trigram_phraser = Phraser(trigram)
# quadgram_phraser = Phraser(quadgram)
# corpus = [quadgram_phraser[trigram_phraser[bigram_phraser[sent]]] for sent in corpus]
In [62]:
from nltk.util import ngrams

# replace each tokenized document with its 5-grams joined by '-';
# documents shorter than five tokens become empty lists
corpus_new = []
for corpus_i in corpus:
    corpus_output_i = ["-".join(i) for i in ngrams(corpus_i, 5)]
    corpus_new.append(corpus_output_i)
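For orientation, a minimal sketch of what that loop yields on hypothetical tokens (not real corpus data):

# list(ngrams(['a', 'b', 'c', 'd', 'e', 'f'], 5))
# -> [('a', 'b', 'c', 'd', 'e'), ('b', 'c', 'd', 'e', 'f')]
# after "-".join: ['a-b-c-d-e', 'b-c-d-e-f']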
In [63]:
corpus = corpus_new
In [64]:
# https://stackoverflow.com/a/5829377
from collections import Counter
# sanity check: every remaining token should be a 5-gram
Counter([len(j.split('-')) for i in corpus for j in i])
Out[64]:
Counter({5: 6123105})
In [65]:
corpus[0:20]
Out[65]:
[[],
 [],
 ['xxx-xx-x-xx-xx',
  'xx-x-xx-xx-xx',
  'x-xx-xx-xx-x',
  'xx-xx-xx-x-xx',
  'xx-xx-x-xx-xx',
  'xx-x-xx-xx-x'],
 [],
 ['xx-xx-x-xx-xx',
  'xx-x-xx-xx-xx',
  'x-xx-xx-xx-x',
  'xx-xx-xx-x-xxxx',
  'xx-xx-x-xxxx-xxx',
  'xx-x-xxxx-xxx-xx'],
 ['x-xx-xx-xx-xx',
  'xx-xx-xx-xx-xxx',
  'xx-xx-xx-xxx-xx',
  'xx-xx-xxx-xx-xx',
  'xx-xxx-xx-xx-xx'],
 ['xx-xxx-xx-xx-xx',
  'xxx-xx-xx-xx-xx',
  'xx-xx-xx-xx-xx',
  'xx-xx-xx-xx-xx',
  'xx-xx-xx-xx-xx',
  'xx-xx-xx-xx-xx',
  'xx-xx-xx-xx-xx',
  'xx-xx-xx-xx-xx'],
 [],
 ['xxx-xx-x-xx-xx', 'xx-x-xx-xx-xx', 'x-xx-xx-xx-xx', 'xx-xx-xx-xx-xxx'],
 ['xx-xxx-xxx-xx-xx', 'xxx-xxx-xx-xx-xx', 'xxx-xx-xx-xx-x', 'xx-xx-xx-x-xx'],
 [],
 ['x-x-xx-xx-xx', 'x-xx-xx-xx-x'],
 [],
 [],
 ['xx-x-xx-xx-x', 'x-xx-xx-x-xx', 'xx-xx-x-xx-x'],
 ['xx-xx-xx-x-xx'],
 [],
 [],
 ['x-xx-xx-xx-xx',
  'xx-xx-xx-xx-xx',
  'xx-xx-xx-xx-xx',
  'xx-xx-xx-xx-xx',
  'xx-xx-xx-xx-xx',
  'xx-xx-xx-xx-xx',
  'xx-xx-xx-xx-x',
  'xx-xx-xx-x-xx'],
 ['xx-xxx-xx-xx-xxx', 'xxx-xx-xx-xxx-xx', 'xx-xx-xxx-xx-xx']]
In [66]:
len(corpus), np.min([len(i) for i in corpus])  # the shortest documents are empty
Out[66]:
(2668243, 0)
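Empty documents survive into the corpus; they are kept deliberately so that row positions stay aligned with df. If one wanted to drop them, a hedged sketch (kept commented, since the later concat relies on positional alignment):

# keep_idx = [i for i, doc in enumerate(corpus) if doc]
# corpus_nonempty = [corpus[i] for i in keep_idx]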
In [67]:
dictionary = corpora.Dictionary(corpus)
In [68]:
doc_term_matrix = [dictionary.doc2bow(rev) for rev in corpus]
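As a reminder of the bag-of-words mapping (toy tokens, not from this corpus): doc2bow counts in-vocabulary tokens and silently drops the rest.

# toy = corpora.Dictionary([['a', 'b']])
# toy.doc2bow(['a', 'a', 'c'])  # -> [(toy.token2id['a'], 2)]; 'c' is unseen, so dropped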
In [69]:
lda = LdaModel(doc_term_matrix, id2word=dictionary, num_topics=n_topics)
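The call above relies on gensim's defaults (a single online pass over the corpus). Given the degenerate topics printed below, a sketch with more training effort may be worth trying; the parameter values are illustrative, not tuned:

# lda = LdaModel(
#     doc_term_matrix,
#     id2word=dictionary,
#     num_topics=n_topics,
#     passes=5,          # several full sweeps instead of the default single pass
#     chunksize=10000,   # documents per update batch
#     random_state=42,   # reproducible topic assignments
# )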
In [70]:
%store lda
%store corpus
%store dictionary
%store doc_term_matrix
Stored 'lda' (LdaModel)
Stored 'corpus' (list)
Stored 'dictionary' (Dictionary)
Stored 'doc_term_matrix' (list)
In [71]:
# render the interactive topic visualization tab
tab = get_tab(lda)
tab.render_notebook()
Out[71]:
In [72]:
from pprint import pprint
In [73]:
lda.print_topics(num_topics=30)
Out[73]:
[(0,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (1,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (2,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (3,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (4,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (5,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (6,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (7,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (8,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (9,
  '0.001*"xx-xx-xx-xx-xxx" + 0.001*"xx-xx-xx-xx-xx" + 0.001*"xx-xx-xx-xxx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x"'),
 (10,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (11,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (12,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (13,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (14,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (15,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (16,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (17,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (18,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (19,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (20,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (21,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (22,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (23,
  '0.000*"xx-xx-xxx-xx-x" + 0.000*"xx-x-xx-xx-xx" + 0.000*"xx-x-xx-xx-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"xx-xx-x-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (24,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (25,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (26,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (27,
  '0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (28,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"'),
 (29,
  '0.000*"xx-app-xx-xx-xx" + 0.000*"xx-xx-x-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"app-xx-xx-xx-xxx" + 0.000*"xx-xx-xx-xxx-x" + 0.000*"xx-xx-xx-xx-xx" + 0.000*"xx-xx-xxx-x-x" + 0.000*"x-xx-xx-xx-xx" + 0.000*"xx-xx-xx-xx-xx"')]
In [74]:
# distance : {'kullback_leibler', 'hellinger', 'jaccard', 'jensen_shannon'}
distance = ['kullback_leibler', 'hellinger', 'jaccard', 'jensen_shannon']
In [75]:
for i in distance:
    topic_over_topic, annotation = lda.diff(lda, distance=i, annotation=True)
    df_topic_over_topic = pd.DataFrame(topic_over_topic)
    df_topic_over_topic.columns = df_topic_over_topic.index = topic_name
    print(i)
    print(df_topic_over_topic.iloc[0:5, 0:5])
    file_name = 'data/df_topic_over_topic_' + i + '.csv'
    df_topic_over_topic.to_csv(file_name)
kullback_leibler
     1    2    3    4    5
1  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0  0.0
4  0.0  0.0  0.0  0.0  0.0
5  0.0  0.0  0.0  0.0  0.0
hellinger
     1    2    3    4    5
1  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0  0.0
4  0.0  0.0  0.0  0.0  0.0
5  0.0  0.0  0.0  0.0  0.0
jaccard
     1    2    3    4    5
1  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0  0.0
4  0.0  0.0  0.0  0.0  0.0
5  0.0  0.0  0.0  0.0  0.0
jensen_shannon
     1    2    3    4    5
1  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0  0.0
4  0.0  0.0  0.0  0.0  0.0
5  0.0  0.0  0.0  0.0  0.0

All pairwise topic distances are zero, i.e. the thirty topics are effectively indistinguishable from one another.
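The annotation requested from lda.diff is available too; per the gensim docs, annotation[i][j] holds the top words the two topics share and the top words that differ. A minimal peek (kept commented):

# shared, different = annotation[0][1]
# print(shared[:5], different[:5])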

In [76]:
import matplotlib.pyplot as plt
import seaborn as sns
ax = sns.heatmap(df_topic_over_topic)

plt.yticks(np.arange(0.5, len(df_topic_over_topic.index), 1), df_topic_over_topic.index)
plt.xticks(np.arange(0.5, len(df_topic_over_topic.columns), 1), df_topic_over_topic.columns)

# turn the axis label
for item in ax.get_yticklabels():
    item.set_rotation(0)

for item in ax.get_xticklabels():
    item.set_rotation(90)

# save figure
plt.savefig('output/heatmap_topic_over_topic.png', dpi=100)
plt.show()

Prediction

In [77]:
# persist the trained LDA model
with open("model/lda_model_sent_segmentation.pkl", 'wb') as fp:
    pkl.dump(lda, fp)
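Pickling works here; for reference, gensim models also ship native persistence helpers that spill large internal arrays to side files. The path is illustrative:

# lda.save("model/lda_model_sent_segmentation.gensim")
# lda = LdaModel.load("model/lda_model_sent_segmentation.gensim")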
In [78]:
# hand n_topics off to the word2vec notebook
%store n_topics
%run lda-short-text-w2v.ipynb
Stored 'n_topics' (int)
The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic
topic id1
(1, 300)
...
topic id30
(1, 300)
In [79]:
# reload the persisted LDA model
with open("model/lda_model_sent_segmentation.pkl", 'rb') as fp:
    lda = pkl.load(fp)
In [80]:
df.shape
Out[80]:
(2668243, 5)
In [81]:
df.columns
Out[81]:
Index(['case_id_new', 'text_level', 'calluuid', 'answer', 'answer_part'], dtype='object')
In [82]:
def get_doc_term_matrix(some_corpus):
    # map each tokenized document to its bag-of-words; empty or missing
    # documents yield an empty bow, keeping one row per document
    some_list = []
    for rev in some_corpus:
        if not rev:
            some_list.append(dictionary.doc2bow([]))
        else:
            some_list.append(dictionary.doc2bow(rev))
    return some_list
In [83]:
# dictionary = corpora.Dictionary(corpus)
In [84]:
# export
def get_doc_topic_output(some_column):
    # tokenize the column, map to bag-of-words, and infer per-document topics
    some_corpus = df[some_column].apply(lambda x: jieba_cut(x, stopwords))
    doc_term_matrix = get_doc_term_matrix(some_corpus)
    corpus_output = lda.get_document_topics(doc_term_matrix)
    return corpus_output
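Note that get_document_topics drops topics below a probability floor by default; a hedged variant that keeps every topic explicitly:

# corpus_output = lda.get_document_topics(doc_term_matrix, minimum_probability=0.0)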
In [85]:
# corpus_output_part1 = get_doc_topic_output('answer_part1')
# corpus_output_part2 = get_doc_topic_output('answer_part2')
# corpus_output_part3 = get_doc_topic_output('answer_part3')
corpus_output = get_doc_topic_output('answer')
In [86]:
%store corpus_output
Stored 'corpus_output' (TransformedCorpus)
In [87]:
def get_topic_df(output):
    # pin num_terms so the matrix always has n_topics columns, even when
    # some topic id never appears in the sparse output
    all_topics_numpy = gensim.matutils.corpus2csc(output, num_terms=n_topics).T.toarray()
    all_topics_df = pd.DataFrame(all_topics_numpy, columns=topic_name).reset_index(drop=True)
    return all_topics_df
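A toy illustration of the corpus2csc step (hypothetical two-document topic output over three topics):

# docs = [[(0, 0.9), (2, 0.1)], [(1, 1.0)]]
# gensim.matutils.corpus2csc(docs, num_terms=3).T.toarray()
# -> array([[0.9, 0. , 0.1],
#           [0. , 1. , 0. ]])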
In [88]:
# subset_index = [i is not None and len(i)>0 for i in corpus]

# some_corpus = corpus[subset_index]

# some_corpus[0:10]
In [89]:
# per-part variants:
# df_part1 = get_topic_df(corpus_output_part1)
# df_part2 = get_topic_df(corpus_output_part2)
# df_part3 = get_topic_df(corpus_output_part3)
# subset
# df_part = get_topic_df(some_corpus[0:1])
# all
df_part = get_topic_df(corpus_output)
In [90]:
# output_col_name = (
#     df.columns.tolist()
#     + ["answer_part1" + "-topic-" + i for i in df_part1.columns.tolist()]
#     + ["answer_part2" + "-topic-" + i for i in df_part2.columns.tolist()]
#     + ["answer_part3" + "-topic-" + i for i in df_part3.columns.tolist()]
# )
output_col_name = (
    df.columns.tolist()
    + ["answer_part" + "-topic-" + i for i in df_part.columns.tolist()]
)
In [91]:
# for i in [df,df_part1,df_part2,df_part3]:
#     print(i.shape)
for i in [df,df_part]:
    print(i.shape)
(2668243, 5)
(2668243, 30)
In [92]:
df.shape,df_part.shape
Out[92]:
((2668243, 5), (2668243, 30))
In [93]:
# df_output = pd.concat([df, df_part1, df_part2, df_part3], axis=1)
# reset_index so concat aligns rows positionally rather than on df's original index
df_output = pd.concat([df.reset_index(drop=True), df_part], axis=1)
In [94]:
df_output.columns = output_col_name
In [95]:
df_output.shape
Out[95]:
(2668243, 35)
In [96]:
df_output.to_csv("data/document_with_topic_with_short_text.csv", index=False, encoding = "UTF-8")
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-96-c1f109ca97c5> in <module>
----> 1 df_output.to_csv("data/document_with_topic_with_short_text.csv", index=False, encoding = "UTF-8")

d:\install\miniconda\lib\site-packages\pandas\core\generic.py in to_csv(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, date_format, doublequote, escapechar, decimal)
   3202             decimal=decimal,
   3203         )
-> 3204         formatter.save()
   3205 
   3206         if path_or_buf is None:

d:\install\miniconda\lib\site-packages\pandas\io\formats\csvs.py in save(self)
    202             )
    203 
--> 204             self._save()
    205 
    206         finally:

d:\install\miniconda\lib\site-packages\pandas\io\formats\csvs.py in _save(self)
    321                 break
    322 
--> 323             self._save_chunk(start_i, end_i)
    324 
    325     def _save_chunk(self, start_i: int, end_i: int) -> None:

d:\install\miniconda\lib\site-packages\pandas\io\formats\csvs.py in _save_chunk(self, start_i, end_i)
    352         )
    353 
--> 354         libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)

pandas\_libs\writers.pyx in pandas._libs.writers.write_csv_rows()

OSError: [Errno 28] No space left on device
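The write failed for lack of disk space, not a code error. A hedged mitigation sketch (gzip compression is an assumption; any drive with room would do):

# df_output.to_csv("data/document_with_topic_with_short_text.csv.gz",
#                  index=False, encoding="utf-8", compression="gzip")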
In [ ]:
df_output.head()
In [ ]:
! jupyter nbconvert --to html --output-dir output lda-short-text.ipynb
In [ ]: