{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Toy corpus: five short English sentences, collected in doc_complete.\n",
"doc_complete = [\n",
"    \"Sugar is bad to consume. My sister likes to have sugar, but not my father.\",\n",
"    \"My father spends a lot of time driving my sister around to dance practice.\",\n",
"    \"Doctors suggest that driving may cause increased stress and blood pressure.\",\n",
"    \"Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.\",\n",
"    \"Health experts say that Sugar is not good for your lifestyle.\",\n",
"]\n",
"\n",
"# Keep the per-document names bound as well, one per entry.\n",
"doc1, doc2, doc3, doc4, doc5 = doc_complete"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"import string\n",
"# Lookup sets for the filters applied in clean(): the NLTK English stop-word\n",
"# list and every character of string.punctuation.\n",
"stop = {word for word in stopwords.words('english')}\n",
"exclude = {symbol for symbol in string.punctuation}\n",
"\n",
"# Single WordNet lemmatizer instance shared by every clean() call.\n",
"lemma = WordNetLemmatizer()\n",
"\n",
"def clean(doc):\n",
"    \"\"\"Return doc lower-cased, without stop words or punctuation, lemmatized.\n",
"\n",
"    The text is split on whitespace. Each token is dropped if it is in the\n",
"    module-level ``stop`` set, otherwise it is stripped of every character in\n",
"    ``exclude`` and passed through ``lemma.lemmatize``.\n",
"\n",
"    Args:\n",
"        doc: Raw document text (str).\n",
"\n",
"    Returns:\n",
"        str: the surviving lemmas joined by single spaces ('' if none survive).\n",
"    \"\"\"\n",
"    kept = []\n",
"    for token in doc.lower().split():\n",
"        # Test the raw token first so stop-list entries that themselves\n",
"        # contain punctuation (e.g. contractions) are still recognised.\n",
"        if token in stop:\n",
"            continue\n",
"        word = ''.join(ch for ch in token if ch not in exclude)\n",
"        # Re-check after stripping: a stop word with attached punctuation\n",
"        # ('it.', 'the,') only matches the stop list once the punctuation is\n",
"        # gone. Tokens made purely of punctuation become empty and are skipped.\n",
"        if not word or word in stop:\n",
"            continue\n",
"        kept.append(lemma.lemmatize(word))\n",
"    return ' '.join(kept)\n",
"\n",
"# One token list per document: clean the text, then split it on whitespace.\n",
"doc_clean = [cleaned.split() for cleaned in map(clean, doc_complete)]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"from gensim import corpora\n",
"# Build the gensim dictionary from the tokenised documents.\n",
"dictionary = corpora.Dictionary(documents=doc_clean)\n",
"\n",
"# Convert every tokenised document to its bag-of-words form.\n",
"doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Short alias for the gensim LDA model class.\n",
"Lda = gensim.models.ldamodel.LdaModel\n",
"\n",
"# LDA training is stochastic: without a fixed random_state the learned topics\n",
"# change on every kernel restart, so the printed topics cannot be reproduced.\n",
"ldamodel = Lda(\n",
"    doc_term_matrix,\n",
"    num_topics=3,\n",
"    id2word=dictionary,\n",
"    passes=50,\n",
"    random_state=42,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, '0.135*\"sugar\" + 0.054*\"like\" + 0.054*\"consume\" + 0.054*\"bad\"'), (1, '0.056*\"father\" + 0.056*\"sister\" + 0.056*\"pressure\" + 0.056*\"driving\"'), (2, '0.029*\"sister\" + 0.029*\"father\" + 0.029*\"blood\" + 0.029*\"may\"')]\n"
]
}
],
"source": [
"# Print the three topics, each summarised by four of its words.\n",
"print(ldamodel.print_topics(num_words=4, num_topics=3))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}