diff --git a/tm_utils.py b/tm_utils.py index 42f138b4dd01763b2e59916092386daaf1a3448b..e5a74735a9958218de1992f2b4331e5715f56169 100644 --- a/tm_utils.py +++ b/tm_utils.py @@ -266,7 +266,7 @@ def format_topics_sentences(ldamodel=None, corpus=None, texts=None): contents = pd.Series(texts) sent_topics_df = pd.concat([sent_topics_df, contents], axis=1) sent_topics_df = sent_topics_df['Dominant_Topic'].fillna(0).apply(lambda x: str(int(x))) - return sent_topics_df + return(sent_topics_df) def convertldaGenToldaMallet(mallet_model): diff --git a/voeb_tm_wallnig.ipynb b/voeb_tm_wallnig.ipynb index 3469b7de9295daf0c81c4403a32de1967ad488a8..c66f14fa7cbb2dfe8dbfccfacee48c564266959a 100644 --- a/voeb_tm_wallnig.ipynb +++ b/voeb_tm_wallnig.ipynb @@ -421,16 +421,51 @@ }, { "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Now for the tables\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, "metadata": {}, + "outputs": [], "source": [ - "### Now for the tables (datasette)\n", - "\n", - "Sometimes we would have to restart datasette with `systemctl start datasette`..." + "from operator import itemgetter\n", + "def format_topics_sentences(ldamodel=None, corpus=None, texts=None):\n", + " # cf. `testing_103.ipynb`\n", + " sent_topics_df = pd.DataFrame()\n", + " topics = {}\n", + " # Get main topic in each document\n", + " for i, row in enumerate(ldamodel[corpus]):\n", + " row_item = row[0]\n", + " # print(row_item)\n", + " row_item = sorted(row_item, key=itemgetter(1), reverse=True)[0]\n", + " topic_num, prop_topic = row_item\n", + " if not topic_num in topics:\n", + " wp = ldamodel.show_topic(topic_num)\n", + " topic_keywords = \", \".join([word for word, prop in wp])\n", + " topics[topic_num] = topic_keywords\n", + " else:\n", + " topic_keywords = topics[topic_num]\n", + " sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), \n", + " str(topic_num)+\": \" + topic_keywords]), ignore_index=True)\n", + " \n", + " sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']\n", + " contents = pd.Series(texts)\n", + " sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)\n", + " \n", + " sent_topics_df['Dominant_Topic'] = sent_topics_df['Dominant_Topic'].fillna(0).apply(lambda x: str(int(x)))\n", + " return(sent_topics_df)\n" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -440,153 +475,108 @@ "corpusname: voeb48-73_\n", "voeb48-73_20211028-175146 voeb48-73\n", "loading dict and corpus from data-tw/dict_voeb48-73_20211028-175146.dict, data-tw/corpus_voeb48-73_20211028-175146.mm\n", - "[(2, 0.2805531), (4, 0.026617138), (7, 0.5166085), (9, 0.031186195), (10, 0.1295674), (11, 0.015451236)]\n", - "[(4, 0.0335408), (7, 0.93391645), (9, 0.0260835)]\n", - "[(4, 0.059605595), (7, 0.9234904), (9, 0.016532116)]\n", - "[(2, 0.99909395)]\n", - "[(2, 0.76777035), (10, 0.2285372)]\n", - "[(2, 0.99807)]\n", - "[(2, 0.018939564), (4, 0.05009651), (7, 0.8701947), (9, 0.019532165), (10, 0.041053202)]\n", - "[(2, 0.30289793), (7, 0.38114664), (9, 0.03813178), (10, 0.27691138)]\n", - "[(4, 0.032640114), (7, 0.935043), (9, 0.02452857)]\n", - "[(4, 0.0550239), (7, 0.93037254), (9, 0.014232905)]\n", - "[(2, 0.9990953)]\n", - "[(2, 0.7647797), (10, 0.23157236)]\n", - "[(2, 0.8355817), (7, 0.11142462), (9, 0.023192778), (10, 0.0131209195), (11, 0.016423883)]\n", - "[(2, 0.46640286), (4, 0.040240686), (7, 0.271938), (9, 0.04701706), (10, 0.09569729), (11, 0.078543805)]\n", - "[(2, 0.092409864), (4, 0.04902445), (7, 0.78285456), (9, 0.054846834), (10, 0.02074028)]\n", - "[(2, 0.112780854), (4, 0.02586087), (7, 0.82400656), (9, 0.029357133)]\n", - "[(2, 0.6328255), (7, 0.34989017), (11, 0.016721573)]\n", - "[(2, 0.1505291), (4, 0.024832528), (7, 0.7936737), (10, 0.018778518)]\n", - "[(2, 0.32843506), (4, 0.040555365), (7, 0.53169554), (9, 0.043821767), (11, 0.047176685)]\n", - "[(2, 0.40401903), (4, 0.06405743), (7, 0.36368492), (9, 0.021000203), (10, 0.013611786), (11, 0.13355342)]\n", - "[(2, 0.3361721), (4, 0.14966382), (7, 0.36112234), (9, 0.030305833), (10, 0.034698848), (11, 0.087940045)]\n", - "[(2, 0.5824958), (4, 0.08177448), (10, 0.33350626)]\n", - "[(2, 0.56737334), (4, 0.099197485), (7, 0.20318973), (9, 0.023298362), (11, 0.10680778)]\n", - "[(2, 0.12868905), (4, 0.06824589), (7, 0.49881265), (9, 0.2887983), (10, 0.014769213)]\n", - "[(2, 0.62545335), (4, 0.12095001), (7, 0.07840649), (10, 0.06259749), (11, 0.11221576)]\n", - "[(2, 0.14459576), (4, 0.0936272), (9, 0.01156143), (11, 0.7498722)]\n", - "[(2, 0.45956668), (4, 0.06435885), (6, 0.07969774), (7, 0.16556579), (9, 0.06613689), (10, 0.03169823), (11, 0.13275075)]\n", - "[(2, 0.0347122), (11, 0.96488273)]\n", - "[(2, 0.088310905), (4, 0.062799305), (5, 0.014985357), (7, 0.15732811), (9, 0.028218329), (10, 0.039133493), (11, 0.6091809)]\n", - "[(2, 0.13958132), (4, 0.03976587), (7, 0.09445578), (9, 0.07991784), (10, 0.0612161), (11, 0.5847389)]\n", - "[(2, 0.43281084), (4, 0.100822575), (7, 0.03893598), (9, 0.026698781), (10, 0.28327924), (11, 0.11717471)]\n", - "[(2, 0.7818935), (4, 0.14960098), (7, 0.06715488)]\n", - "[(2, 0.90718), (4, 0.046793427), (9, 0.045456998)]\n", - "[(2, 0.34763563), (4, 0.3842686), (7, 0.09916011), (9, 0.034207687), (10, 0.07952978), (11, 0.055022098)]\n", - "[(4, 0.042264383), (7, 0.858117), (9, 0.09142882)]\n", - "[(4, 0.052148297), (7, 0.48781973), (9, 0.45966977)]\n", - "[(4, 0.055325173), (7, 0.49083987), (9, 0.45347628)]\n", - "[(2, 0.023911085), (4, 0.120421834), (7, 0.4024756), (9, 0.23378178), (10, 0.12844229), (11, 0.09075342)]\n", - "[(2, 0.36894888), (4, 0.04179211), (7, 0.1389178), (9, 0.041064333), (10, 0.40913212)]\n", - "[(2, 0.3261571), (4, 0.09141748), (7, 0.06889465), (10, 0.40162906), (11, 0.111012295)]\n", - "[(2, 0.24502474), (4, 0.1662812), (7, 0.10007411), (9, 0.057046663), (10, 0.06392365), (11, 0.36729845)]\n", - "[(4, 0.03527757), (7, 0.8000644), (9, 0.13965479), (10, 0.016864482)]\n", - "[(4, 0.046740916), (9, 0.068160124), (11, 0.87861127)]\n", - "[(4, 0.26713732), (7, 0.70290494), (10, 0.028992627)]\n", - "[(4, 0.09300666), (7, 0.7673158), (9, 0.13829629)]\n", - "[(4, 0.022928521), (9, 0.032768436), (10, 0.011241415), (11, 0.93288296)]\n", - "[(2, 0.25791875), (4, 0.11015291), (7, 0.2376752), (11, 0.39120385)]\n", - "[(2, 0.1337724), (4, 0.18047144), (5, 0.1493544), (7, 0.2036052), (10, 0.14679994), (11, 0.18432277)]\n", - "[(2, 0.3362658), (4, 0.48107362), (7, 0.18136439)]\n", - "[(2, 0.68358207), (4, 0.1382686), (7, 0.07193991), (9, 0.034910206), (10, 0.043409046), (11, 0.020519607)]\n", - "[(2, 0.09697018), (4, 0.2164108), (7, 0.2175626), (9, 0.0722713), (11, 0.39592153)]\n", - "[(4, 0.17859197), (7, 0.71225655), (9, 0.07464845), (10, 0.034257855)]\n", - "[(4, 0.16188428), (7, 0.7660489), (9, 0.071355864)]\n", - "[(4, 0.47196886), (7, 0.47376692), (9, 0.053329516)]\n", - "[(2, 0.83459514), (4, 0.09010547), (9, 0.0745168)]\n", - "[(2, 0.26260626), (4, 0.20297582), (5, 0.06683364), (7, 0.09248767), (9, 0.053551324), (11, 0.32117915)]\n", - "[(2, 0.0151258465), (4, 0.0487812), (5, 0.01280832), (7, 0.84131074), (9, 0.076691516)]\n", - "[(2, 0.39000282), (4, 0.18601581), (5, 0.06887991), (7, 0.15141004), (9, 0.07428958), (11, 0.12027027)]\n", - "[(2, 0.31650478), (4, 0.24970184), (5, 0.046464406), (7, 0.13724995), (10, 0.24818428)]\n", - "[(4, 0.22466533), (6, 0.117342986), (7, 0.09998731), (10, 0.16637614), (11, 0.38846293)]\n", - "[(2, 0.09362584), (3, 0.1238802), (4, 0.3593046), (7, 0.3338605), (9, 0.088648304)]\n", - "[(0, 0.22675493), (2, 0.06712009), (4, 0.31200948), (7, 0.2582018), (9, 0.066289075), (10, 0.06149375)]\n", - "[(0, 0.22614719), (4, 0.36301723), (7, 0.33477435), (9, 0.07516922)]\n", - "[(2, 0.53701824), (4, 0.3603416), (5, 0.04599441), (10, 0.055573378)]\n", - "[(2, 0.3383623), (4, 0.17071772), (5, 0.28385293), (7, 0.17929342), (9, 0.026837116)]\n", - "[(2, 0.35674962), (4, 0.37027392), (7, 0.15435497), (9, 0.05567779), (10, 0.046111044), (11, 0.016374981)]\n", - "[(2, 0.400412), (4, 0.09259629), (7, 0.1721627), (9, 0.33391258)]\n", - "[(2, 0.36706805), (4, 0.06187569), (7, 0.11543792), (9, 0.40301746), (10, 0.012816688), (11, 0.039422754)]\n", - "[(2, 0.14874376), (4, 0.123410955), (7, 0.2002042), (9, 0.394779), (10, 0.13235474)]\n", - "[(7, 0.14047143), (9, 0.084204175), (10, 0.7734269)]\n", - "[(4, 0.6215544), (7, 0.042740345), (9, 0.105777524), (10, 0.08722626), (11, 0.14207742)]\n", - "[(0, 0.020146385), (4, 0.2174245), (7, 0.29464835), (9, 0.20780663), (11, 0.2591272)]\n", - "[(2, 0.39141038), (4, 0.5413344), (10, 0.06588475)]\n", - "[(2, 0.29253194), (4, 0.14046106), (5, 0.27256966), (7, 0.08628638), (9, 0.17593509), (11, 0.03168814)]\n", - "[(2, 0.091311015), (4, 0.22958764), (9, 0.1416632), (10, 0.040435653), (11, 0.49630657)]\n", - "[(2, 0.32591188), (4, 0.14575388), (5, 0.010086877), (7, 0.10935142), (9, 0.10230067), (10, 0.26048017), (11, 0.045984503)]\n", - "[(9, 0.024144225), (11, 0.97554266)]\n", - "[(2, 0.182749), (4, 0.48231214), (7, 0.100606464), (9, 0.07756725), (10, 0.026663717), (11, 0.12979878)]\n", - "[(2, 0.24598487), (4, 0.28187233), (7, 0.12888093), (9, 0.11054232), (10, 0.23169671)]\n", - "[(2, 0.373515), (4, 0.18770917), (5, 0.020417225), (7, 0.1690224), (9, 0.12493417), (10, 0.11487046)]\n", - "[(2, 0.37631118), (4, 0.20933467), (5, 0.21088243), (7, 0.03913531), (9, 0.16323185)]\n", - "[(2, 0.33449683), (4, 0.045258284), (9, 0.22064714), (11, 0.3981758)]\n", - "[(2, 0.2638961), (4, 0.5413212), (7, 0.107675515), (9, 0.031046528), (10, 0.055511333)]\n", - "[(4, 0.0828329), (7, 0.64052755), (9, 0.21048824), (10, 0.062462017)]\n", - "[(2, 0.043114707), (4, 0.2607085), (7, 0.23128694), (10, 0.43940273), (11, 0.021594316)]\n", - "[(2, 0.36785796), (4, 0.1603459), (9, 0.44459584), (10, 0.025889214)]\n", - "[(2, 0.28757623), (4, 0.15526289), (7, 0.15263833), (8, 0.0376659), (9, 0.27751094), (10, 0.08849836)]\n", - "[(2, 0.39055955), (4, 0.58292824), (5, 0.02370643)]\n", - "[(4, 0.17245817), (7, 0.16922893), (9, 0.6537295)]\n", - "[(2, 0.7588662), (4, 0.07487608), (5, 0.040221617), (9, 0.064358346), (10, 0.061249927)]\n", - "[(2, 0.33220628), (4, 0.05561143), (5, 0.43853623), (9, 0.17216116)]\n", - "[(2, 0.28272012), (4, 0.18492875), (5, 0.07377189), (7, 0.21272132), (9, 0.052982252), (10, 0.19233702)]\n", - "[(2, 0.21442762), (4, 0.4965944), (7, 0.08489679), (9, 0.1397767), (11, 0.06360585)]\n", - "[(2, 0.29686487), (4, 0.12553315), (9, 0.07112925), (10, 0.013048349), (11, 0.4931021)]\n", - "[(2, 0.78269356), (4, 0.10137049), (9, 0.115548015)]\n", - "[(2, 0.2884586), (4, 0.1481043), (7, 0.17265686), (9, 0.31642842), (11, 0.07348688)]\n", - "[(2, 0.409713), (4, 0.17038544), (7, 0.08065667), (9, 0.25919116), (10, 0.027758613), (11, 0.05205198)]\n", - "[(2, 0.21934862), (4, 0.16837862), (7, 0.08992787), (9, 0.08549616), (10, 0.40364566), (11, 0.032902677)]\n", - "[(2, 0.4154466), (4, 0.34010348), (7, 0.13885087), (9, 0.08750115), (11, 0.01742183)]\n", - "[(2, 0.17591386), (4, 0.29826498), (7, 0.0637388), (9, 0.098285966), (10, 0.3572086)]\n", - "[(5, 0.9466116), (9, 0.051640823)]\n", - "[(4, 0.9819012), (10, 0.016341466)]\n", - "[(2, 0.03543197), (4, 0.9460082), (11, 0.015636228)]\n", - "[(4, 0.010969139), (5, 0.22666751), (9, 0.7608348)]\n", - "[(4, 0.013159093), (9, 0.9839817)]\n", - "[(4, 0.9210771), (9, 0.077488765)]\n", - "[(4, 0.088405326), (9, 0.91085476)]\n", - "[(4, 0.5857286), (9, 0.40695855)]\n", - "[(9, 0.07238506), (10, 0.92089194)]\n", - "[(4, 0.010254808), (9, 0.9852471)]\n", - "[(4, 0.94235295), (9, 0.05665622)]\n", - "[(9, 0.9980122)]\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'Series' object has no attribute 'columns'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_29650/1374924362.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mdf_topic_sent_keywords\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtm_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat_topics_sentences\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mldamodel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlda_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtexts\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_topic_sent_keywords\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0mdf_topic_sent_keywords\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Dominant_Topic'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_topic_sent_keywords\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Dominant_Topic'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;31m# Format\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/tljh/user/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5463\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5464\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5465\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5466\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5467\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'columns'" + "Index(['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 0], dtype='object')\n" ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/tljh/user/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n", - " from imp import reload\n", - "/opt/tljh/user/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n", - " from imp import reload\n", - "/opt/tljh/user/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n", - " from imp import reload\n", - "/opt/tljh/user/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n", - " from imp import reload\n", - "/opt/tljh/user/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n", - " from imp import reload\n", - "/opt/tljh/user/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n", - " from imp import reload\n", - "/opt/tljh/user/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n", - " from imp import reload\n", - "/opt/tljh/user/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n", - " from imp import reload\n" - ] + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Document_No</th>\n", + " <th>Dominant_Topic</th>\n", + " <th>Topic_Perc_Contrib</th>\n", + " <th>Keywords</th>\n", + " <th>Text</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>7</td>\n", + " <td>0.5166</td>\n", + " <td>7: osterr, titel, vereinigung, mitteilungen, d...</td>\n", + " <td>\\nScnftfalen\\n\\n\\ne,/.\\ndei 3et\\n\\nARBEITSPRO...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>7</td>\n", + " <td>0.9339</td>\n", + " <td>7: osterr, titel, vereinigung, mitteilungen, d...</td>\n", + " <td>\\nMITTEILUNGEN\\nDER VEREINIGUNG ÖSTERREICHISCH...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2</td>\n", + " <td>7</td>\n", + " <td>0.9235</td>\n", + " <td>7: osterr, titel, vereinigung, mitteilungen, d...</td>\n", + " <td>\\nMITTEILUNGEN\\nDER VEREINIGUNG ÖSTERREICHISCH...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3</td>\n", + " <td>2</td>\n", + " <td>0.9991</td>\n", + " <td>2: vereinigung, osterr, mitglieder, vorsitzend...</td>\n", + " <td>\\n\\nMITTEILUNGEN\\nDER VEREINIGUNG ÖSTERREICHIS...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>0.7678</td>\n", + " <td>2: vereinigung, osterr, mitglieder, vorsitzend...</td>\n", + " <td>\\n\\nMITTEILUNGEN\\nDER VEREINIGUNG ÖSTERREICHIS...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Document_No Dominant_Topic Topic_Perc_Contrib \\\n", + "0 0 7 0.5166 \n", + "1 1 7 0.9339 \n", + "2 2 7 0.9235 \n", + "3 3 2 0.9991 \n", + "4 4 2 0.7678 \n", + "\n", + " Keywords \\\n", + "0 7: osterr, titel, vereinigung, mitteilungen, d... \n", + "1 7: osterr, titel, vereinigung, mitteilungen, d... \n", + "2 7: osterr, titel, vereinigung, mitteilungen, d... \n", + "3 2: vereinigung, osterr, mitglieder, vorsitzend... \n", + "4 2: vereinigung, osterr, mitglieder, vorsitzend... \n", + "\n", + " Text \n", + "0 \\nScnftfalen\\n\\n\\ne,/.\\ndei 3et\\n\\nARBEITSPRO... \n", + "1 \\nMITTEILUNGEN\\nDER VEREINIGUNG ÖSTERREICHISCH... \n", + "2 \\nMITTEILUNGEN\\nDER VEREINIGUNG ÖSTERREICHISCH... \n", + "3 \\n\\nMITTEILUNGEN\\nDER VEREINIGUNG ÖSTERREICHIS... \n", + "4 \\n\\nMITTEILUNGEN\\nDER VEREINIGUNG ÖSTERREICHIS... " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -604,7 +594,7 @@ "\n", "# we should create a column with the date/vol in it (also necessary for timeline)\n", "\n", - "df_topic_sent_keywords = tm_utils.format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)\n", + "df_topic_sent_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)\n", "print(df_topic_sent_keywords.columns)\n", "df_topic_sent_keywords['Dominant_Topic'] = df_topic_sent_keywords['Dominant_Topic'].fillna(0).apply(lambda x: str(int(x)))\n", "# Format\n", @@ -615,7 +605,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -646,13 +636,14 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "28\n", "Table voeb48-73_most_representative_topics in jbekesi.db updated/created\n" ] } @@ -665,9 +656,10 @@ "for i, grp in sent_topics_outdf_grpd:\n", " most_repr = pd.concat([most_repr, grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], \n", " axis=0)\n", - "\n", + "print(most_repr.size)\n", "# Reset Index \n", "most_repr.reset_index(inplace=True)\n", + "most_repr.head()\n", "most_repr.set_index('index', drop=False, inplace=True)\n", "#sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)\n", "# Format\n", @@ -677,7 +669,7 @@ "most_repr.head()\n", "output_table = \"{}_most_representative_topics\".format(corpusname)\n", "output_csv = str(DATA.joinpath(output_table + \".csv\"))\n", - "most_repr.to_csv(put_csv, sep=\";\", index=False)\n", + "most_repr.to_csv(output_csv, sep=\";\", index=False)\n", "tm_utils.csv_to_datasette(tablename=output_table, csv=output_csv, db=None)" ] }, @@ -692,7 +684,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -724,7 +716,7 @@ "df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']\n", "\n", "# Show\n", - "# df_dominant_topics\n", + "df_dominant_topics.head()\n", "output_table = \"{}_distribution_of_topics\".format(corpusname)\n", "output_csv = str(DATA.joinpath(output_table + \".csv\"))\n", "df_dominant_topics.to_csv(output_csv, sep=\";\", index=False)\n", @@ -767,9 +759,6 @@ } ], "source": [ - "#\n", - "# ok, we use mallet for this until we know how to reproduce it with gensim...\n", - "# \n", "data_lemmatized = tm_utils.get_lemmatized(corpusname=corpusname, datadir=DATA)\n", "id2word, corpus = tm_utils.get_corpus_dictionary(data_lemmatized, \n", " corpusname=corpusname, save=False, \n",