diff --git "a/data/topic_charts.json" "b/data/topic_charts.json" --- "a/data/topic_charts.json" +++ "b/data/topic_charts.json" @@ -1,4515 +1,5433 @@ [ - [ - "Number of Document of Each Topic", - { - "type": "pie", - "kwargs": { - "x": [ - 535838, - 206990, - 368022, - 200460, - 435310, - 250450, - 933732, - 271801, - 639890, - 387594, - 271359, - 1473798, - 459519, - 1101903, - 31659, - 2254859, - 591041 - ], - "labels": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "autopct": "%1.1f%%", - "colors": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 
0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ], - "pctdistance": 1.2, - "labeldistance": 1.5 - }, - "comment": "As shown in the graph above, over 20% of the documents are related to Business & Economics & Finance, which makes it the largest topic group in dataset. On the contrary, the group of Culture & Cultural geography contains the smallest number of documents among all topics." - } - ], - [ - "Fraction of Words Corrected in Lines", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.005599317351029421, - 0.005491440909735792, - 0.010611897213357221, - 0.0061721529486005915, - 0.005040363960665401, - 0.0042498218252128035, - 0.008174887952855342, - 0.005098232906967347, - 0.005905725848762689, - 0.008048438948020924, - 0.005920233062429675, - 0.00738773833987446, - 0.006788916830535338, - 0.007824824620615435, - 0.007817009319252808, - 0.006894261391191716, - 0.007759051322619051 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 
1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "In average, documents related to Shopping & Commodity have larger fraction of words corrected in lines." 
- } - ], - [ - "Fraction of Lines Ending with Ellipsis", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.013608683903284204, - 0.01187771888948645, - 0.010704198151112872, - 0.013181499370177098, - 0.012342863597933462, - 0.01669603038717465, - 0.013958760786106517, - 0.011481605295821474, - 0.011727508302172751, - 0.013890752469918237, - 0.012950109439490815, - 0.015828153615401713, - 0.011233498318616135, - 0.013063106813702607, - 0.013101045053120094, - 0.012854514904197168, - 0.014225441730661032 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 
0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Compared with other topics, Personal Development & Human Resources & Career in average contain more lines ending with ellipsis." - } - ], - [ - "Fraction of Lines Starting with Bullet Point", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.05958846605103529, - 0.06540916901994907, - 0.10871161367473074, - 0.057639202535687495, - 0.05391125998418046, - 0.048856823399157104, - 0.0919025139411848, - 0.06361059519326412, - 0.08348033701472354, - 0.09887120370776314, - 0.0654760782941809, - 0.07275273301463199, - 0.08648053868877607, - 0.0728023788334523, - 0.059507615068158916, - 0.08230576538579888, - 0.06015758928408362 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, 
- 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Shopping & Commodity related documents have higher percentage of lines starting with bullet point." 
- } - ], - [ - "Number of Lines with Toxic Words", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.18993986988604764, - 0.8879124595391081, - 0.25990565781393504, - 0.26195250922877383, - 0.25880866508924677, - 1.059369135555999, - 0.13686689542609656, - 0.41953855946078195, - 0.8275813030364594, - 0.15215921815095176, - 0.13490615752563948, - 0.7103062970637767, - 0.10924031432867846, - 0.983178192635831, - 0.1341482674752835, - 0.14871528552339636, - 0.44260888838506973 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 
0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Personal Development & Human Resources & Career in average has more lines with toxic words." - } - ], - [ - "Number of Toxic Words", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.2548288848495254, - 1.5926663123822407, - 0.4235181592404802, - 0.38067444876783396, - 0.32550136684202063, - 2.0770772609303254, - 0.20720185235163838, - 0.590086129190106, - 1.571774836299989, - 0.20227609302517582, - 0.18648727331689754, - 1.453566228207665, - 0.15104924932374938, - 2.337839174591593, - 0.20351242932499447, - 0.24778267732040007, - 0.7902395942075084 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 
- ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Daily Life & Home & Lifestyle in average has more toxic words." 
- } - ], - [ - "Word Count", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 524.2469683001206, - 634.8099570027538, - 332.5969724636027, - 654.5120023944927, - 634.4970021364086, - 747.0358714314234, - 624.2853688210322, - 570.2685052667209, - 746.3173279782462, - 427.6056492102561, - 603.5602799243807, - 470.1159928294108, - 559.1577497339609, - 450.07929463845727, - 682.6580435263274, - 559.7302638435485, - 514.1515901604118 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 
0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Documents in the topic of Personal Development & Human Resources & Career in average contain more words than other topics." - } - ], - [ - "Mean Word Length", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 4.851116616082301, - 5.17314698008811, - 4.951553714759433, - 4.8636771295932055, - 5.165523097115738, - 4.64498800138652, - 5.233981234962708, - 5.094122002544284, - 5.191578081429402, - 4.872407702558401, - 5.077044932121297, - 4.911569182027774, - 5.25771470252484, - 4.990336313339119, - 5.138998450653204, - 5.165914329275205, - 4.943227231080612 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 
0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "There is no significant variance in the average word length for different topic groups. However, Education related data contain longer words than others in general." 
- } - ], - [ - "Number of Sentences", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 23.816802839664227, - 28.88356925455336, - 17.167653020743327, - 32.65256909109049, - 26.743545978727802, - 41.80899580754642, - 28.010818950191275, - 25.435358957472562, - 35.18096235290441, - 22.968376703457743, - 28.56101327024348, - 22.844366731397383, - 26.802678452904015, - 22.603309910218957, - 30.825168198616506, - 26.01027691753675, - 27.965867004150304 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - 
], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Documents in the topic of Personal Development & Human Resources & Career usually contain more sentences." - } - ], - [ - "Symbol to Word Ratio", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.0029508316364481296, - 0.002339527014691741, - 0.002746622681352375, - 0.0031207893125393786, - 0.0024594503072570637, - 0.003732116125668388, - 0.0029521717963945683, - 0.002009846839273012, - 0.0023335875319153666, - 0.0032912280108721562, - 0.0026740153080243275, - 0.0037401276658117497, - 0.0022685436825723537, - 0.0034624173472893424, - 0.0022837896768252673, - 0.002565854536163215, - 0.0035536009817103663 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 
0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Documents related to Entertainment & Travel & Hobby usually have higher percentage of symbols." 
- } - ], - [ - "Fraction of Words with Alpha Character", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.9554513833362817, - 0.9672667625084445, - 0.945038227724378, - 0.9650443058450766, - 0.9662993498435797, - 0.9795101768513954, - 0.949647348401343, - 0.9644024275136092, - 0.9651040360235426, - 0.9515637138100507, - 0.9638773263904938, - 0.9544175710037947, - 0.9602638724414636, - 0.9533095901329957, - 0.9536863995733356, - 0.9573400271816177, - 0.9613916720605239 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 
0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "The fraction of words with alpha character seems to be relatively consistent across different topics." - } - ], - [ - "Number of Stop Words", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 106.35206163056745, - 139.14766896951542, - 61.915719712408496, - 150.11113937942733, - 141.82980634490363, - 156.21242563385906, - 122.75635942647355, - 122.98374178167114, - 152.60597915266686, - 83.42474857711936, - 128.65106740517174, - 93.49815985637109, - 114.57335387655353, - 86.25348147704472, - 162.30932752139992, - 114.21801717978818, - 106.4132116046095 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 
- ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Culture & Cultural geography contains more stop words in average." 
- } - ], - [ - "Has Curly Bracket", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.005337434075224228, - 0.0067539494661577855, - 0.01028199401122759, - 0.009842362566097974, - 0.011575658725965403, - 0.00931123976841685, - 0.02773600990434086, - 0.006582021405366427, - 0.009203144290424917, - 0.01040779785032792, - 0.008158933368710822, - 0.007557345036429687, - 0.010752547772779798, - 0.011963847997509762, - 0.012824157427587732, - 0.009383291815585807, - 0.008704979857573332 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 
0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Natural Science & Formal Science & Technology has a significantly higher rate in percentage of documents that contain curly bracket. It might be related to the coding data." - } - ], - [ - "Number of Document Duplication", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 12.971086410444947, - 7.7029131842117975, - 6.170992495013885, - 7.104888755861518, - 8.650198708966025, - 6.623561589139549, - 6.508078335111145, - 9.093410252353744, - 6.089149697604276, - 7.057779532190901, - 7.702302116384568, - 6.5227466722033824, - 6.954972482095409, - 6.535254918082626, - 9.99308253577182, - 5.590145547903439, - 6.865564317873041 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, 
- 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Sports related documents have a higher number of duplication count." 
- } - ], - [ - "Number of Dump Duplication", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 3.8719109133730716, - 3.26455384318083, - 2.2721848150382313, - 3.265644018756859, - 3.444853093197951, - 3.1923417847873825, - 2.7517906637022187, - 3.3698330764051643, - 2.710181437434559, - 3.1639266861716124, - 3.206342888940481, - 2.7590002157690536, - 2.8303421621304015, - 2.6106544768459656, - 3.9413752803310276, - 2.4888664878823907, - 3.094817449212491 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 
1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "In average, Culture & Cultural geography related documents are duplicated across a higher number of common crawl dumps. Duplication of Shopping & Commodity appears in less dumps than others." - } - ], - [ - "Number of Year Duplication", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 1.4484135130393887, - 1.4291463355717668, - 1.229893865040677, - 1.4503641624264192, - 1.452872665456801, - 1.442735076861649, - 1.3276539735170263, - 1.4222795353953812, - 1.328097016674741, - 1.406236938652301, - 1.4158586964132385, - 1.3305229074812153, - 1.3344910656577857, - 1.2914712093532734, - 1.5438579866704571, - 1.2835525414227675, - 1.404339123681775 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, 
- 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "In average, Culture & Cultural geography related documents are duplicated across more years than other topics." 
- } - ], - [ - "Maximum Span of Year Duplication", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 1.4810222492619038, - 1.4615875163051355, - 1.2437028221138953, - 1.4792776613788288, - 1.49291539362753, - 1.4697025354362148, - 1.3531216666024084, - 1.4549983259811405, - 1.3486599259247682, - 1.432375114165854, - 1.4460216908228583, - 1.352992743917416, - 1.3557807185339452, - 1.309416527589089, - 1.582425218737168, - 1.3025537295236642, - 1.4325029904862776 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 
0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "In average, Culture & Cultural geography related documents are duplicated across a wider span of years." - } - ], - [ - "Language Score", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.9413726660750416, - 0.9347130364355554, - 0.8847180050664069, - 0.9336572405289453, - 0.9420075430577804, - 0.9522977107155225, - 0.8831956938165678, - 0.9481278901144439, - 0.9241279717677588, - 0.9066709862541587, - 0.9270825804900252, - 0.9117954084131167, - 0.921528771738386, - 0.8992133008305735, - 0.9224377655046135, - 0.9152426551412108, - 0.9178671893764959 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 
- ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Average language scores of different topic groups are mostly consistent. No significant differences are obeserved." 
- } - ], - [ - "Fraction of Duplicate Lines", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.01235014200385377, - 0.012056246914591116, - 0.018525258494333133, - 0.012726935235443207, - 0.01165333793386552, - 0.010444387257042395, - 0.016149995700960602, - 0.012705431934865763, - 0.01519943556613772, - 0.014809953345215319, - 0.012686293057054212, - 0.01603496888664195, - 0.01596207137084465, - 0.016014032499666292, - 0.013610478505124169, - 0.01580386009616988, - 0.015060041023072804 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 
0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "In average, Shopping & Commodity has a larger fraction of duplicate lines than others." - } - ], - [ - "Fraction of Characters in Duplicate Lines", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.00501345725991589, - 0.004299959504074716, - 0.0073601226054879785, - 0.004651424152553605, - 0.00450348053495509, - 0.003909584113418541, - 0.0063485903557626774, - 0.00521503913729261, - 0.005782503341128245, - 0.005962335751386622, - 0.004749891704712697, - 0.006420052544922626, - 0.0063561887111620065, - 0.006466672218067342, - 0.004978436253072214, - 0.006108371322424041, - 0.0057332990952240126 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 
1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - }, - "comment": "Shopping & Commodity usually has a larger fraction of characters in duplicate lines than others." 
- } - ], - [ - "Fraction of Characters in Most Common Bigram", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.02614438964212445, - 0.02549244163757135, - 0.03593997020714517, - 0.026520648762908574, - 0.023796693998532542, - 0.019517664362790295, - 0.03146900938445295, - 0.026900122790576828, - 0.027486920194835916, - 0.029735671266585457, - 0.02724062185263462, - 0.030402249730981233, - 0.03031798250174187, - 0.034936591389516845, - 0.02730012031746535, - 0.029329317288923955, - 0.028440195287636943 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 
0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - } - } - ], - [ - "Fraction of Characters in Most Common 3-gram", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.025877442339206684, - 0.026073122171118526, - 0.03794965393907832, - 0.02756936824343807, - 0.024589084236341825, - 0.019970321326854976, - 0.031104349287997282, - 0.027138921074492478, - 0.02674544851177018, - 0.03082668946385283, - 0.027642774270487825, - 0.031311152209273344, - 0.030596143210215625, - 0.0352048856850328, - 0.028135774349846692, - 0.029182052353507664, - 0.03023015052666528 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 
0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - } - } - ], - [ - "Fraction of Characters in Most Common 4-gram", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.026649882448510714, - 0.026904261739744192, - 0.04086332064828129, - 0.0286899321496711, - 0.025495383586610822, - 0.020748509542508307, - 0.03171918073481819, - 0.027563495776633813, - 0.02693813171261885, - 0.03243470539147362, - 0.0287992899739741, - 0.032589127100319, - 0.031139178077804624, - 0.03630423964958027, - 0.029809457289325606, - 0.029522356378146167, - 
0.032375986416410006 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - } - } - ], - [ - "Fraction of Characters in Duplicate 5-grams", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & 
Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.016989093741368057, - 0.01874268974292254, - 0.02001249239006167, - 0.01893345653851295, - 0.017576185062959156, - 0.013966567341084396, - 0.026648000310062814, - 0.021239561601745963, - 0.022547189937081085, - 0.016903077473431387, - 0.018277127900190513, - 0.019079382613460993, - 0.023467347573746446, - 0.021192854307303135, - 0.019157826340526964, - 0.021183180653813184, - 0.016589870490142093 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - } - } - ], - [ - "Fraction of Characters 
in Duplicate 6-grams", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.011946923836249373, - 0.013297904416841108, - 0.014258151562338789, - 0.013153098583726782, - 0.012601072651000291, - 0.009837626317910313, - 0.01949595975959962, - 0.014924056163499448, - 0.015889641140216917, - 0.011840004108930956, - 0.012940087820238557, - 0.013424858515603134, - 0.016468963654372438, - 0.014839192401791004, - 0.01388493355575309, - 0.01498376261489033, - 0.011812586464028073 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 
0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - } - } - ], - [ - "Fraction of Characters in Duplicate 7-grams", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.009095872215498261, - 0.010146405377015994, - 0.010487557518535542, - 0.009868429864354638, - 0.009802808055168035, - 0.007541438580109868, - 0.015129318997269138, - 0.011302686364124783, - 0.011969695536420487, - 0.00892604076557906, - 0.009759568234633746, - 0.010070709856859254, - 0.012419860047704056, - 0.011070038486862109, - 0.010547925069683646, - 0.011327481696653985, - 0.008957792606056945 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 
0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - } - } - ], - [ - "Fraction of Characters in Duplicate 8-grams", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.0073128979182685684, - 0.008109486840255576, - 0.00813393016693516, - 0.007792984494504855, - 0.008002590936702558, - 0.006117199534770664, - 0.012284551039331444, - 0.009023639757214827, - 0.009496488981527608, - 0.007086539674993228, - 0.0076650824522217454, - 0.007877075837565403, - 0.00987496717434344, - 0.008652258777583252, - 0.008392133389867372, - 0.00901948167673584, - 0.007053229339496676 - ], - "color": [ - [ - 1.0, 
- 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - } - } - ], - [ - "Fraction of Characters in Duplicate 9-grams", - { - "type": "barh", - "kwargs": { - "y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & 
Economics & Finance", - "Arts" - ], - "width": [ - 0.006148310898840968, - 0.00676529697013875, - 0.00643290688721836, - 0.006434352312383364, - 0.0067735701471297, - 0.005172565516416477, - 0.010288525380088334, - 0.007482544617336476, - 0.00780204339328974, - 0.005852660603046196, - 0.006240040171999708, - 0.006465362460507409, - 0.008165651028577293, - 0.006986331812620781, - 0.006928899525750178, - 0.007487698229960434, - 0.005728220555701086 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - } - } - ], - [ - "Fraction of Characters in Duplicate 10-grams", - { - "type": "barh", - "kwargs": { - 
"y": [ - "Sports", - "Society & Social Issues & Human Rights", - "Shopping & Commodity", - "Religion & Spirituality", - "Politics & Government", - "Personal Development & Human Resources & Career", - "Natural Science & Formal Science & Technology", - "Law & Justice", - "Health & Wellness & Medicine", - "Food & Drink & Cooking", - "Environment", - "Entertainment & Travel & Hobby", - "Education", - "Daily Life & Home & Lifestyle", - "Culture & Cultural geography", - "Business & Economics & Finance", - "Arts" - ], - "width": [ - 0.005325364079705381, - 0.005797357629820572, - 0.005283647214644124, - 0.005467491111249268, - 0.005879825006312822, - 0.004529332536092203, - 0.008882676950579147, - 0.006399831899960353, - 0.006645377495475746, - 0.005021569571200667, - 0.005306020206939719, - 0.00550360123328725, - 0.007013864844056383, - 0.005835955446724545, - 0.005845555947354781, - 0.00641447975612288, - 0.004809876486057196 - ], - "color": [ - [ - 1.0, - 0.4980392156862745, - 0.054901960784313725, - 1.0 - ], - [ - 1.0, - 0.7333333333333333, - 0.47058823529411764, - 1.0 - ], - [ - 0.17254901960784313, - 0.6274509803921569, - 0.17254901960784313, - 1.0 - ], - [ - 0.596078431372549, - 0.8745098039215686, - 0.5411764705882353, - 1.0 - ], - [ - 0.8392156862745098, - 0.15294117647058825, - 0.1568627450980392, - 1.0 - ], - [ - 1.0, - 0.596078431372549, - 0.5882352941176471, - 1.0 - ], - [ - 0.5803921568627451, - 0.403921568627451, - 0.7411764705882353, - 1.0 - ], - [ - 0.7725490196078432, - 0.6901960784313725, - 0.8352941176470589, - 1.0 - ], - [ - 0.5490196078431373, - 0.33725490196078434, - 0.29411764705882354, - 1.0 - ], - [ - 0.7686274509803922, - 0.611764705882353, - 0.5803921568627451, - 1.0 - ], - [ - 0.8901960784313725, - 0.4666666666666667, - 0.7607843137254902, - 1.0 - ], - [ - 0.9686274509803922, - 0.7137254901960784, - 0.8235294117647058, - 1.0 - ], - [ - 0.4980392156862745, - 0.4980392156862745, - 0.4980392156862745, - 1.0 - ], - [ - 0.7803921568627451, - 
0.7803921568627451, - 0.7803921568627451, - 1.0 - ], - [ - 0.7372549019607844, - 0.7411764705882353, - 0.13333333333333333, - 1.0 - ], - [ - 0.8588235294117647, - 0.8588235294117647, - 0.5529411764705883, - 1.0 - ], - [ - 0.09019607843137255, - 0.7450980392156863, - 0.8117647058823529, - 1.0 - ] - ] - }, - "x_label": "Metrics", - "subplots_adjust": { - "left": 0.37, - "right": 0.98 - } - } - ] + [ + "Number of Document of Each Topic", + { + "type": "pie", + "kwargs": { + "x": [ + 324853053, + 127033069, + 233531055, + 123094708, + 267497100, + 148588074, + 581871647, + 165387460, + 390492627, + 244588996, + 170281196, + 914696921, + 281274506, + 686870899, + 19458015, + 1411221902, + 366116749 + ], + "labels": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "autopct": "%1.1f%%", + "colors": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 
0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ], + "pctdistance": 1.2, + "labeldistance": 1.5 + }, + "comment": "As shown in the graph above, over 20% of the documents are related to Business & Economics & Finance, which makes it the largest topic group in dataset. On the contrary, the group of Culture & Cultural geography contains the smallest number of documents among all topics." + } + ], + [ + "Fraction of Words Corrected in Lines", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.005660839019938058, + 0.005601641737204916, + 0.010656858389603374, + 0.006108459524594901, + 0.005077341851036456, + 0.004333818728677237, + 0.00812686384284095, + 0.005099914065389049, + 0.005922873834475705, + 0.008028764588273587, + 0.005868815973653353, + 0.007446294346393395, + 0.006845364607248323, + 0.007812665071102337, + 0.007692180748283549, + 0.006834288663313659, + 0.007850315335340054 + ], + 
"color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "In average, documents related to Shopping & Commodity have larger fraction of words corrected in lines." 
+ } + ], + [ + "Fraction of Lines Ending with Ellipsis", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.013698353704877283, + 0.011988367184873385, + 0.010239788510367555, + 0.013182844498032174, + 0.012825014289657984, + 0.016784713501187303, + 0.013729740175749594, + 0.012272497721678627, + 0.011805768817329271, + 0.013464839491767208, + 0.012785021526251267, + 0.015677345947523093, + 0.011127706885026923, + 0.012810749078485683, + 0.013244961193298873, + 0.012872493046687979, + 0.014188113777531883 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 
0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Compared with other topics, Personal Development & Human Resources & Career in average contain more lines ending with ellipsis." + } + ], + [ + "Fraction of Lines Starting with Bullet Point", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.05924759002174845, + 0.06636489569195865, + 0.1111156447572103, + 0.057900707172324956, + 0.05498350949228654, + 0.04950217629831486, + 0.09247477225558454, + 0.06597399742617387, + 0.08548870827846955, + 0.09873316891194645, + 0.06547543788491705, + 0.0735152711822082, + 0.08847503034590092, + 0.07390893089349196, + 0.058802087892367495, + 0.08333351946410401, + 0.06067125030474924 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, 
+ 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Shopping & Commodity related documents have higher percentage of lines starting with bullet point." 
+ } + ], + [ + "Number of Lines with Toxic Words", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.19922507238988454, + 0.8737279739340943, + 0.2723972492651994, + 0.2602153376081773, + 0.26610157268994694, + 1.0438407391968751, + 0.13075538461491662, + 0.43004878362603793, + 0.7763741362522576, + 0.15141952256920013, + 0.1365380766999076, + 0.7216673095153012, + 0.10996786534219351, + 1.0588212632953606, + 0.13198319561373553, + 0.1422862362860352, + 0.45226715918424154 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 
0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Daily Life & Home & Lifestyle in average has more lines with toxic words." + } + ], + [ + "Number of Toxic Words", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.2733295968131166, + 1.5669363935464709, + 0.44824271872535326, + 0.372924147153426, + 0.3350402266043258, + 2.0669407694186814, + 0.19540095755860054, + 0.5992540667835397, + 1.458653655962626, + 0.1993768722121906, + 0.18759733165134687, + 1.4411351199901983, + 0.1523272713524915, + 2.455736465842033, + 0.18513111435056454, + 0.22537730852195914, + 0.781900764665645 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 
0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Daily Life & Home & Lifestyle in average has more toxic words." 
+ } + ], + [ + "Word Count", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 528.940000185253, + 630.8575219496587, + 331.12940931560473, + 652.5414443568118, + 639.8122070257958, + 745.046993327338, + 616.6008528251936, + 561.0260046983005, + 739.9628602078574, + 427.55716142683707, + 611.8977898886733, + 470.5665158383101, + 557.2392759050832, + 448.4545774765747, + 666.4168803960733, + 555.4812271358867, + 506.7156364621822 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 
0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Documents in the topic of Personal Development & Human Resources & Career in average contain more words than other topics." + } + ], + [ + "Mean Word Length", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 4.8591533713129555, + 5.180746132747496, + 4.95975994177285, + 4.875042818402709, + 5.168670579970495, + 4.654983410185081, + 5.237515458154388, + 5.1004147156966715, + 5.205703704499496, + 4.880593401877592, + 5.08581294318828, + 4.914944728270949, + 5.264151733240911, + 4.9967250103431935, + 5.143653278714547, + 5.172399304913307, + 4.948735274753513 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, 
+ 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "There is no significant variance in the average word length for different topic groups. However, Education related data contain longer words than others in general." 
+ } + ], + [ + "Number of Sentences", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 23.889495217396032, + 28.667429824906456, + 17.00780648209721, + 32.56295546840243, + 27.016352794105057, + 41.47359763206837, + 27.639144560346658, + 24.82906568611671, + 34.94192956170719, + 22.905121737365487, + 28.89842860864097, + 22.790945701674666, + 26.620081949410658, + 22.43558780468875, + 30.96235911011478, + 25.786870409555195, + 27.291027835495175 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + 
], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Documents in the topic of Personal Development & Human Resources & Career usually contain more sentences." + } + ], + [ + "Symbol to Word Ratio", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.0029673437245102876, + 0.002369316991198444, + 0.0026953845515368074, + 0.0030995856207761256, + 0.002515345366978788, + 0.003716508288521279, + 0.002910243583180489, + 0.0021063347433407133, + 0.0023350751882016177, + 0.00325952936332765, + 0.0026651973287582483, + 0.0037352572697365097, + 0.002278588397824893, + 0.0033945285429091187, + 0.002321581720070917, + 0.002557382224711868, + 0.0035588078008559885 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 
0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Documents related to Entertainment & Travel & Hobby usually have a higher percentage of symbols." 
+ } + ], + [ + "Fraction of Words with Alpha Character", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.9543530503658745, + 0.966243732434154, + 0.9437721848599528, + 0.9641198106485631, + 0.9655064629815291, + 0.9789937421507563, + 0.9480065698252734, + 0.9637242361370412, + 0.9640004505795688, + 0.950377474345678, + 0.9627294216362635, + 0.9531905135921064, + 0.9586824669836848, + 0.9522644098234544, + 0.9526614781429045, + 0.9564103310368344, + 0.9600895447178572 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 
0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "The fraction of words with alpha character seems to be relatively consistent across different topics." + } + ], + [ + "Number of Stop Words", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 107.18473168851517, + 138.47942025237538, + 61.60918090315654, + 149.6609807547535, + 142.7609030677342, + 156.05911428665533, + 121.44399604677766, + 120.49043039901574, + 151.07195976327614, + 83.29154811200092, + 130.22922543954883, + 93.5245517591504, + 114.12362959051823, + 85.78707441498406, + 158.3108779081525, + 113.40855796468499, + 104.68147387870529 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + 
], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Culture & Cultural geography contains more stop words on average." 
+ } + ], + [ + "Has Curly Bracket", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.005196103236253101, + 0.006631383517940514, + 0.010188400853154199, + 0.00940792678106032, + 0.011743005812025626, + 0.009386123411223433, + 0.027600317153793196, + 0.0069009343271853865, + 0.009196375940793372, + 0.010313546566910966, + 0.007981867827613802, + 0.007428252838734548, + 0.010558184039615734, + 0.011781598276738173, + 0.01233861727416697, + 0.00927681818248878, + 0.00859682603594844 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 
0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Natural Science & Formal Science & Technology has a significantly higher rate in percentage of documents that contain curly bracket. It might be related to the coding data." + } + ], + [ + "Number of Document Duplication", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 7.537686235012851, + 7.1025928374603, + 5.966075852310092, + 7.145161033242794, + 7.434088750868701, + 6.676430767922868, + 6.555241123477528, + 7.261966584407307, + 6.24676702026438, + 6.911925072867955, + 7.12188790945537, + 6.103017919746556, + 6.569012504105153, + 5.991152694037777, + 8.832861317045957, + 6.079587144899626, + 6.927007100677604 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 
0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Culture & Cultural geography related documents have a higher number of duplication count." 
+ } + ], + [ + "Number of Dump Duplication", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 3.192987467475025, + 3.256401260367881, + 2.059270656744132, + 3.2623413672665764, + 3.4332213994095637, + 3.184122448481296, + 2.718956481479841, + 3.2133479527407944, + 2.675923289583647, + 3.1140861341121004, + 3.18235214298119, + 2.64155309428444, + 2.727249994708017, + 2.398892457664013, + 3.9119670737225767, + 2.418408296500489, + 3.0585330664563504 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + 
], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "On average, Culture & Cultural geography related documents are duplicated across a higher number of Common Crawl dumps. Duplication of Shopping & Commodity appears in fewer dumps than others." + } + ], + [ + "Number of Year Duplication", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 1.5100191962794944, + 1.5532559399946482, + 1.286292351995755, + 1.5752399851340482, + 1.583747083613243, + 1.5616323218510795, + 1.4345138525025949, + 1.5322301884314566, + 1.4292428061644298, + 1.514165600483515, + 1.5310395693955543, + 1.420375959699989, + 1.4311913003590877, + 1.3601090413935268, + 1.6918824453573502, + 1.372621704109578, + 1.5093211072952033 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 
1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "On average, Culture & Cultural geography related documents are duplicated across more years than other topics." 
+ } + ], + [ + "Maximum Span of Year Duplication", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 1.6027068922144314, + 1.6463481331778262, + 1.326032869589871, + 1.6654945962421066, + 1.7014688495688364, + 1.6394770282842483, + 1.5158579087803534, + 1.622844253125358, + 1.4877078793090759, + 1.5888212812321287, + 1.6172138231869126, + 1.4887702131010017, + 1.492519460686565, + 1.41459098414941, + 1.822498749230073, + 1.4288667063218525, + 1.592900066967436 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 
0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "On average, Culture & Cultural geography related documents are duplicated across a wider span of years." + } + ], + [ + "Language Score", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.940489806219048, + 0.9341014072001546, + 0.8833469805761418, + 0.9326888501156927, + 0.9414304493962583, + 0.9514325652491805, + 0.8825959914278214, + 0.9474163424125213, + 0.9228861253995115, + 0.9051492112749342, + 0.9259433469236898, + 0.9106329146251756, + 0.9205018098890236, + 0.8984234924235204, + 0.922120043098531, + 0.9144863004649139, + 0.9163656720680041 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + 
], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Average language scores of different topic groups are mostly consistent. No significant differences are observed." 
+ } + ], + [ + "Fraction of Duplicate Lines", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.012600997185367828, + 0.01188115899050692, + 0.019039766660027862, + 0.0124898927764339, + 0.011646428662052831, + 0.010610017211082174, + 0.0159476139009855, + 0.012597314331886177, + 0.015094734040349217, + 0.014975673115722092, + 0.012534733023571196, + 0.01610487136667016, + 0.015238474263765327, + 0.01591887690664154, + 0.01433554300372473, + 0.015517507810570494, + 0.015401894047378658 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 
0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "On average, Shopping & Commodity has a larger fraction of duplicate lines than others." + } + ], + [ + "Fraction of Characters in Duplicate Lines", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.004938858081287135, + 0.004205415457276368, + 0.007552357256944613, + 0.004582746666516553, + 0.0044753076683352235, + 0.003940081675446834, + 0.006179645047614322, + 0.0050437770645133185, + 0.005686946797304247, + 0.005994693646977406, + 0.0046510979989690445, + 0.006342709242367984, + 0.005829011670104205, + 0.006381457735701225, + 0.005068730018848793, + 0.005971977053954138, + 0.005856182489324971 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 
1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Shopping & Commodity usually has a larger fraction of characters in duplicate lines than others." 
+ } + ], + [ + "Fraction of Characters in Most Common Bigram", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.026404273123764796, + 0.025634991525195914, + 0.03614706934891397, + 0.026733992014165146, + 0.02377814829063671, + 0.019649114365205896, + 0.03137691450183766, + 0.0270495750357038, + 0.027673178183087933, + 0.029942339233414595, + 0.027350265679715224, + 0.030526314564882247, + 0.030614040432541026, + 0.03509742016691783, + 0.027540083176404263, + 0.029519105783701725, + 0.028834560229748462 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 
0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Most Common 3-gram", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.026165437832849362, + 0.026251631875192687, + 0.03811434473394529, + 0.027814565987299922, + 0.02458896408514931, + 0.020185288853227328, + 0.031209395373852387, + 0.027345772022684685, + 0.026970288190643604, + 0.030974020712503342, + 0.027787662286871063, + 0.03143649261443422, + 0.030952890587447934, + 0.035409395984874435, + 0.028486665111510972, + 0.029371087024795153, + 0.030651728333515618 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 
0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Most Common 4-gram", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.026986818387229945, + 0.027131471441621514, + 0.04096520359337113, + 0.02892589321318727, + 0.025567532544329325, + 0.02099809142740805, + 0.03184294279840072, + 0.027798282452682368, + 0.027173562606014456, + 0.03262575837410923, + 0.028962796310066586, + 0.03275942719001153, + 0.03150508247840716, + 0.03652360609065789, + 0.029967601450189167, + 0.029689817413511087, + 
0.032849409839897196 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Duplicate 5-grams", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & 
Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.017172412387336047, + 0.018706184899018846, + 0.020367380081370155, + 0.01912555365329135, + 0.017726236260087368, + 0.014196179855798982, + 0.026453126704962582, + 0.02113702754713442, + 0.022750991771259714, + 0.017193015331520775, + 0.018542560337896252, + 0.019254844137973823, + 0.022992720412462874, + 0.02142410388811584, + 0.019425070816460523, + 0.021273316544081922, + 0.016721728689196018 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters 
in Duplicate 6-grams", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.011970605317388902, + 0.013247085951653395, + 0.014520268748996269, + 0.013346559736076019, + 0.012676990872510209, + 0.009987587475557972, + 0.019267682560495096, + 0.014804574416538653, + 0.01599622943881697, + 0.012044805932442022, + 0.013103480807140754, + 0.013565938336254593, + 0.015985684726478346, + 0.014952398432378033, + 0.014038548162484649, + 0.01499523284606334, + 0.01191912942692566 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 
0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Duplicate 7-grams", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.009039855940169648, + 0.010073325457247666, + 0.010696329689399887, + 0.009986471165253209, + 0.009836605907258674, + 0.007629540534323915, + 0.014862665767002146, + 0.011114675308487159, + 0.011994499294618745, + 0.00907677139620476, + 0.009828884274472392, + 0.010105882592285087, + 0.011911397850057279, + 0.011114070775684791, + 0.010705940851157975, + 0.011289206913404862, + 0.009020791758628242 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 
0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Duplicate 8-grams", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.007195970331280814, + 0.008036785010913946, + 0.008249465083499188, + 0.007850453980445486, + 0.007992494679173406, + 0.006182227979778321, + 0.011985128922160077, + 0.008779317883993009, + 0.009467708596743243, + 0.0072104637314519765, + 0.007673480063984403, + 0.007904073310509803, + 0.00934269422506397, + 0.008657166799636231, + 0.008485033120385031, + 0.00893995298657962, + 0.0070980585511939785 + ], + "color": [ + [ + 
1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Duplicate 9-grams", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + 
"Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.006007801998269901, + 0.0066553421544330435, + 0.006529544209051376, + 0.006431532192110704, + 0.006749247730086534, + 0.0052160144431644155, + 0.009999607112669078, + 0.007230897967718682, + 0.007732210045591141, + 0.005943279041721623, + 0.006205408840055294, + 0.006469113514028088, + 0.007626168747361047, + 0.006984803950948357, + 0.006992627523875565, + 0.007390774121782952, + 0.005766236221861412 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Duplicate 10-grams", + { + "type": 
"barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.0051710981353331464, + 0.005682713875756294, + 0.005363391346741175, + 0.005464661863661183, + 0.005846752796603754, + 0.0045485380381742845, + 0.00859601801316329, + 0.006136039855302813, + 0.0065548495409889435, + 0.005091836417990565, + 0.005250172827665216, + 0.005493532455475418, + 0.0064690246645603714, + 0.00581859783771439, + 0.005913783298441542, + 0.006306495977016168, + 0.004847123500711834 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 
0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Number of Document of Each Topic in Duplication Bucket 1-1", + { + "type": "pie", + "kwargs": { + "x": [ + 132249226, + 47101525, + 108551234, + 42778158, + 106867576, + 52904902, + 254436283, + 65155001, + 160648797, + 96650903, + 67150855, + 409977727, + 110689452, + 314681138, + 6571908, + 632953103, + 144137883 + ], + "labels": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "autopct": "%1.1f%%", + "colors": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], 
+ [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ], + "pctdistance": 1.2, + "labeldistance": 1.5 + } + } + ], + [ + "Number of Document of Each Topic in Duplication Bucket 2-5", + { + "type": "pie", + "kwargs": { + "x": [ + 104341527, + 43192514, + 75077276, + 41770802, + 83866134, + 49842746, + 190845342, + 53891858, + 130879713, + 77844628, + 54343851, + 284749532, + 98624858, + 215519319, + 6476944, + 461711335, + 117251313 + ], + "labels": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "autopct": "%1.1f%%", + "colors": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 
0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ], + "pctdistance": 1.2, + "labeldistance": 1.5 + } + } + ], + [ + "Number of Document of Each Topic in Duplication Bucket 6-10", + { + "type": "pie", + "kwargs": { + "x": [ + 38443961, + 16376927, + 24393877, + 18121006, + 33219823, + 22183319, + 61039668, + 20168641, + 46703585, + 32303976, + 22128963, + 101878274, + 33435189, + 76066340, + 2712172, + 150381514, + 48762315 + ], + "labels": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "autopct": "%1.1f%%", + "colors": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 
0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ], + "pctdistance": 1.2, + "labeldistance": 1.5 + } + } + ], + [ + "Number of Document of Each Topic in Duplication Bucket 11-100", + { + "type": "pie", + "kwargs": { + "x": [ + 47907124, + 19868534, + 24683580, + 19990446, + 42239133, + 23300618, + 73293979, + 25410832, + 51086693, + 36943522, + 25974730, + 115373947, + 37581053, + 78644079, + 3587872, + 162190029, + 54743888 + ], + "labels": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & 
Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "autopct": "%1.1f%%", + "colors": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ], + "pctdistance": 1.2, + "labeldistance": 1.5 + } + } + ], + [ + "Number of Document of Each Topic in Duplication Bucket 101-1000", + { + "type": "pie", + "kwargs": { + "x": [ + 1879583, + 484835, + 792159, + 425638, + 1281577, + 350055, + 2198979, + 744933, + 1141913, + 825604, + 669062, + 2659131, + 922347, + 1892942, + 106472, + 3840419, + 1189540 + ], + "labels": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & 
Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "autopct": "%1.1f%%", + "colors": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ], + "pctdistance": 1.2, + "labeldistance": 1.5 + } + } + ], + [ + "Number of Document of Each Topic in Duplication Bucket 1001-30000000", + { + "type": "pie", + "kwargs": { + "x": [ + 31632, + 8734, + 32929, 
+ 8658, + 22857, + 6434, + 57396, + 16195, + 31926, + 20363, + 13735, + 58310, + 21607, + 67081, + 2647, + 145502, + 31810 + ], + "labels": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "autopct": "%1.1f%%", + "colors": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 
0.8117647058823529, + 1.0 + ] + ], + "pctdistance": 1.2, + "labeldistance": 1.5 + } + } + ] ] \ No newline at end of file