yonkasoft committed
Commit
dab5364
1 Parent(s): 2c996b2

Upload datasets.ipynb

Files changed (1)
  1. datasets.ipynb +371 -1
datasets.ipynb CHANGED
@@ -437,6 +437,359 @@
437
  "TF-IDF HESAPLAMA"
438
  ]
439
  },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": 13,
443
+ "metadata": {},
444
+ "outputs": [
445
+ {
446
+ "name": "stdout",
447
+ "output_type": "stream",
448
+ "text": [
449
+ "Tokens: ['[CLS]', 'Biy', '##ografi', 'İsim', 'P', '##şı', '##q', '##o', 'ismi', 'Ah', '##ec', '##a', '##q', '##o', 'soy', 'ismi', '##dir', '.', 'Çerkes', '##lerin', '\"', '-', 'q', '##o', '\"', 'son', '##eki', 'ile', 'biten', 'hem', 'soya', '##d', '##ları', 'hem', 'de', 'lak', '##ap', '##ları', 'vardı', '.', 'Bu', 'ek', 'Türkçe', 'ad', '##lardaki', '\"', '-', 'oğlu', '\"', 'eki', '##yle', 'eş', 'anlamlı', '##dır', '.', 'P', '##şı', '##q', '##o', 'Türkçe', '\"', 'Beyoğlu', '\"', 'anlamına', 'gelen', 'bir', 'lak', '##ap', '##tır', '.', 'Erken', 'dönem', 'Çerkes', '##ler', 'tarihleri', '##ni', 'yazma', '##dıkları', 've', 'tüm', 'bilgiler', 'Rus', 'kaynaklarından', 'geldiği', 'için', 'Ah', '##ec', '##a', '##q', '##o', 'hakkında', 'pek', 'bir', 'şey', 'kayded', '##ilme', '##di', '.', '177', '##7', \"'\", 'de', 'Çerkes', '##ya', \"'\", 'nın', 'B', '##je', '##duğ', 'bölgesinde', 'doğdu', '.', 'Asker', '##î', 'eğitim', 'ile', 'büyüt', '##üldü', '.', 'Rus', '-', 'Çerkes', 'Savaşı', '##na', 'Katılım', '##ı', 'Birkaç', 'kaynak', ',', 'Ah', '##ec', '##a', '##q', '##o', \"'\", 'nun', 'tüm', 'Çerkes', '##ya', \"'\", 'da', 'saygı', 'duyulan', 'bir', 'kişi', 'olduğunu', 'belirtir', '.', 'En', 'az', '6', '.', '000', 'at', '##lı', '##dan', 'oluşan', 'kalıcı', 'bir', 'ordusu', 'vardı', 've', 'çatışmalar', 'sırasında', 'müfre', '##ze', '##si', '12', '.', '000', 'at', '##lı', '##ya', 'ulaşıyor', '##du', '.', 'Rus', 'birlikleri', '##ne', 'karşı', 'kazandığı', 'zafer', '##lerle', 'ünlü', '##y', '##dü', '.', 'Askeri', 'becerisi', '##nin', 'yanı', 'sıra', 'yetenekli', 'bir', 'devlet', 'adamı', '##ydı', '.', 'Ölüm', '18', '##37', 'yılında', 'Rus', 'tarafına', 'geçti', 've', 'bir', 'yıl', 'sonra', 'hastalık', '##tan', 'öldü', '.', 'Kaynak', '##ça', 'Çerkes', 'soylu', '##lar', '177', '##7', 'doğumlu', '##lar', '18', '##38', 'yılında', 'ölen', '##ler', 'Kafkas', 'Savaşı', \"'\", 'nda', 'kişiler', '[SEP]']\n",
450
+ "Average Embedding Shape: torch.Size([768])\n",
451
+ "Average Embedding: tensor([ 3.1219e-01, -3.4488e-02, 1.1118e-02, -3.6194e-02, 1.3312e-02,\n",
452
+ " 8.7684e-02, 6.0835e-01, -5.8831e-03, 4.2102e-01, 3.7467e-01,\n",
453
+ " -1.9954e-01, 2.5975e-01, -8.9819e-02, 6.8351e-02, -2.3226e-01,\n",
454
+ " -6.4409e-02, 1.1375e-01, 6.9892e-02, 3.4909e-01, -2.0660e-01,\n",
455
+ " 4.2718e-02, -2.3758e-01, -1.2146e-01, 6.9431e-01, 8.2034e-02,\n",
456
+ " -4.4726e-01, -3.9995e-01, 4.9043e-01, -5.3700e-01, 4.0028e-02,\n",
457
+ " 2.4516e-02, -1.9234e-01, -5.9901e-02, 4.0203e-01, 1.7956e-01,\n",
458
+ " 2.7692e-01, 4.2539e-01, -1.0046e-01, -1.9326e-01, -2.3722e-01,\n",
459
+ " 3.9989e-01, 1.1785e-01, -3.7475e-01, -4.7698e-01, 1.2440e-01,\n",
460
+ " 1.7583e-01, 4.7179e-01, -6.1670e-01, 3.4876e-01, -1.1977e-01,\n",
461
+ " 4.3870e-01, -4.7105e-01, 3.8414e-01, 3.6902e-01, -1.2121e-02,\n",
462
+ " -3.3284e-02, 2.5584e-01, 2.0225e-01, 1.4411e-01, 2.9933e-01,\n",
463
+ " 3.6910e-01, 2.3893e-01, 6.0434e-01, 1.5669e-01, -8.5170e-01,\n",
464
+ " -2.5171e-01, 3.6258e-01, 4.5186e-01, -2.9369e-01, 3.8370e-01,\n",
465
+ " 4.9858e-01, -7.5623e-02, 1.1362e-02, -1.3621e-01, -2.7373e-01,\n",
466
+ " -3.1269e-01, -6.4951e-01, -6.9747e-02, 2.1302e-01, 3.4201e-01,\n",
467
+ " -3.8148e-01, -3.2749e-02, 7.4201e-01, -6.0619e-01, -1.8069e-01,\n",
468
+ " -1.5151e-01, 7.6336e-02, -4.0224e-02, -5.9742e-01, -1.7219e-02,\n",
469
+ " -5.6787e-01, 2.6290e-01, 2.3984e-01, 4.8434e-01, 4.7557e-01,\n",
470
+ " 2.2432e-01, -1.0822e-01, 3.5924e-01, -4.4102e-01, -1.1613e+00,\n",
471
+ " 5.3896e-02, -2.8951e-01, -1.0792e+00, -2.2577e-02, -2.9868e-01,\n",
472
+ " -2.7837e-01, 1.0477e-01, 3.8852e-01, 2.9142e-01, -4.2427e-01,\n",
473
+ " 3.6626e-01, 7.9898e-02, 2.2686e-01, 2.3253e-02, -6.9434e-01,\n",
474
+ " 3.2550e+00, -5.6280e-02, 1.1168e-01, 4.2853e-01, 7.7213e-02,\n",
475
+ " 3.1671e-01, -2.9387e-01, -2.1341e-01, -7.9131e-02, -1.0102e-01,\n",
476
+ " -5.7301e-01, 5.6494e-01, 2.0392e-01, -2.6974e-01, -9.0448e-01,\n",
477
+ " -7.6977e-01, 5.1432e-02, -1.3809e-01, 2.2806e-01, -3.8749e-01,\n",
478
+ " 4.0886e-01, 2.2627e-02, -2.4360e-02, -1.0032e-01, -8.8879e-03,\n",
479
+ " -2.9814e-01, 2.4151e-01, -6.5038e-01, 5.5605e-01, -1.5214e-02,\n",
480
+ " -4.4102e-01, 2.1589e-01, 8.9567e-02, -3.3454e-01, 4.1183e-01,\n",
481
+ " -2.5177e-02, -4.8496e-01, 3.7691e-01, 6.1995e-02, -2.9426e-01,\n",
482
+ " -1.5210e-01, 5.1504e-01, 4.9226e-01, 1.0083e-01, 1.9789e-01,\n",
483
+ " 6.5205e-01, -9.7679e-02, 3.4597e-02, 9.5440e-02, 6.5158e-01,\n",
484
+ " -5.6019e-01, -1.1912e-01, 1.9009e-01, 1.1314e-01, 1.0752e-01,\n",
485
+ " 4.7765e-01, 2.5196e-01, -1.5925e-01, 1.3468e-01, -1.9441e-01,\n",
486
+ " -5.0252e-02, 4.2977e-01, 2.7336e-01, 4.7672e-02, 2.3097e-01,\n",
487
+ " 1.5998e-01, -1.3434e-01, 3.8424e-01, -3.9759e-01, -2.6207e-02,\n",
488
+ " 2.9264e-02, -1.2846e-01, -3.9234e-01, -2.3295e-01, -1.4392e-01,\n",
489
+ " 7.9061e-02, 2.8095e-01, -1.6391e-01, 2.0505e-01, -1.2172e-01,\n",
490
+ " -2.5179e-01, 8.8469e-02, -1.5946e+00, -6.6211e-01, 1.6993e-01,\n",
491
+ " -1.6472e-02, 2.5519e-01, -2.4024e-02, 5.7010e-01, 6.1551e-03,\n",
492
+ " 7.0113e-02, -3.9507e-01, -2.2114e-02, -2.0259e-01, -8.9107e-03,\n",
493
+ " 1.1820e-01, -1.0522e-02, 5.2899e-01, -3.6007e-01, -5.6266e-01,\n",
494
+ " 1.3287e-01, -5.8443e-01, -2.5912e-01, -4.3816e-02, -1.1244e-01,\n",
495
+ " 1.0696e+00, 3.1219e-01, -4.1700e-01, 1.1373e-01, -2.2935e-01,\n",
496
+ " -1.4058e-02, 2.6080e-01, 6.1457e-03, 5.5064e-02, 5.2089e-01,\n",
497
+ " 1.3195e-01, -6.0868e-01, 4.0164e-01, -1.8374e-01, 8.4919e-01,\n",
498
+ " -4.2096e-01, -3.7411e-01, 1.8478e-02, -5.6272e-01, -2.5044e-01,\n",
499
+ " -1.1385e-01, 1.6000e-01, 3.3307e-01, -5.7846e-02, -4.1887e-02,\n",
500
+ " -1.7514e-01, 2.8522e-01, -3.3909e-01, 1.7133e-01, 2.4794e-02,\n",
501
+ " -3.0897e-01, 1.7487e-01, -4.8215e-01, -1.0892e-01, 1.0915e-01,\n",
502
+ " -2.9227e-02, -6.7439e-02, -3.6022e-01, -8.8648e-02, 2.5974e-01,\n",
503
+ " -2.2780e-02, 1.8174e-02, 8.9919e-02, 1.6508e+00, -6.3506e-01,\n",
504
+ " 4.9391e-01, 7.9321e-02, 3.2023e-02, 3.1216e-01, -7.8220e-02,\n",
505
+ " 3.5055e-01, -2.8349e-01, -4.8787e-01, -5.3590e-01, -4.5163e-01,\n",
506
+ " 2.4602e-01, 4.0553e-01, -2.9002e-01, -1.6120e-01, 1.3428e-02,\n",
507
+ " 4.7906e-01, 2.2494e-01, 3.5909e-01, 1.2861e-01, -1.7966e-01,\n",
508
+ " 9.8253e-02, -9.9344e-02, 2.3110e-01, 3.1276e-01, 6.4092e-02,\n",
509
+ " 2.7386e-01, -3.8601e-01, -5.6480e-01, -5.6070e-01, -6.4271e-02,\n",
510
+ " -2.8354e-01, 6.7687e-02, -5.7471e-01, 3.0518e-02, -1.3380e-02,\n",
511
+ " -3.6718e-01, 3.8880e-01, -1.9569e-01, 2.8110e-01, -2.9406e-01,\n",
512
+ " -2.5891e-01, -3.0043e-01, -3.3694e-01, 5.7723e-02, -1.2361e+00,\n",
513
+ " -1.1917e-01, -2.6665e-01, -5.6574e-02, -3.8907e-01, 4.2425e-01,\n",
514
+ " -6.5229e-02, 6.5768e-01, -1.0842e-01, -7.0508e-01, 8.4208e-02,\n",
515
+ " -3.7736e-01, 3.2153e-01, 5.6956e-01, 1.2256e-01, 4.2261e-01,\n",
516
+ " -2.7749e-01, 7.9419e-02, -8.1517e-02, -3.0462e-02, 1.5746e-01,\n",
517
+ " -8.7179e-02, 1.8869e-01, 4.1413e-01, 3.7192e-01, -1.9835e-01,\n",
518
+ " -2.5932e-01, 5.4023e-02, -3.8093e-01, 1.1605e-01, -1.4389e-01,\n",
519
+ " -4.5509e-01, -6.0786e-01, 4.2643e-01, 1.6004e-01, -3.4740e-02,\n",
520
+ " -4.4579e-01, -5.6887e-01, -1.1662e-01, 2.1577e-01, 6.6576e-03,\n",
521
+ " -2.3879e-01, 4.4046e-01, -2.6281e-01, 2.4404e-01, 8.1931e-02,\n",
522
+ " 2.2825e-01, -1.5294e-01, -3.7482e-01, 8.8104e-02, 4.0676e-01,\n",
523
+ " 1.6295e-01, 5.8565e-01, -8.0144e-02, -4.1792e-01, -4.6798e-01,\n",
524
+ " 3.9977e-01, -3.7319e-01, -1.2999e-01, -4.4200e-01, -2.9825e-01,\n",
525
+ " -1.2899e-01, -1.8651e-01, -2.0209e-02, -6.6213e-01, 5.0630e-02,\n",
526
+ " -4.6655e-01, -4.3796e-01, 6.7476e-02, 3.4367e-01, 1.8640e-01,\n",
527
+ " 3.3172e-01, -4.1092e-01, 2.6630e-02, -4.9168e-02, -3.4948e-01,\n",
528
+ " 1.6500e-02, -4.3398e-01, 2.6911e-01, 3.4227e-02, -2.1475e-01,\n",
529
+ " 9.7154e-01, -2.9554e-01, 8.5149e-01, -6.0231e-01, 1.0421e-01,\n",
530
+ " 6.2897e-01, 1.8700e-02, 1.6866e-01, -7.0568e-03, -6.9820e-01,\n",
531
+ " -1.3916e-01, 3.2686e-01, -1.5017e-01, 6.5600e-01, 2.9388e-02,\n",
532
+ " -6.0431e-01, 3.8548e-02, -1.2187e-01, -4.8818e-01, 1.5922e-01,\n",
533
+ " -2.1494e-02, -2.1316e-01, -1.5983e-01, -3.7928e-01, 5.6203e-01,\n",
534
+ " 3.1285e-01, -4.0310e-01, 3.8763e-01, -4.1886e-01, 1.6276e-01,\n",
535
+ " 1.2610e-01, 3.5952e-01, 1.3288e-01, 6.0504e-01, -3.4769e-01,\n",
536
+ " -1.5976e-01, 2.9626e-01, -2.2079e-01, -1.5934e-01, -5.8491e-01,\n",
537
+ " -5.7811e-02, -4.7510e-01, 2.7285e-03, 3.7191e-01, 4.7557e-01,\n",
538
+ " 9.2435e-02, 2.3198e-01, -5.8704e-01, -1.9506e-01, -5.3740e-01,\n",
539
+ " 1.8715e-01, -3.5691e-01, 2.5481e-01, 3.2795e-01, -9.4206e-02,\n",
540
+ " -2.2492e-01, -3.1406e-01, 4.5814e-01, -1.7896e-01, -3.9470e-01,\n",
541
+ " 1.9183e-01, -4.3177e-01, 2.7146e-01, 1.9477e-01, -1.7568e-02,\n",
542
+ " -2.0134e-01, 5.7984e-03, 3.0490e-01, -2.7846e-01, 9.8830e-03,\n",
543
+ " -3.0119e-01, -4.1994e-01, -1.0905e-02, 6.9638e-01, 9.4965e-02,\n",
544
+ " -2.6103e-01, 8.8206e-02, -1.0292e-01, -1.2342e-01, -2.2317e-03,\n",
545
+ " -5.2474e-02, -2.1636e-01, -1.6554e-01, 2.3173e-01, 1.2170e-01,\n",
546
+ " 4.5793e-01, -1.1033e-01, 1.4489e-01, 2.2540e-01, 5.2360e-01,\n",
547
+ " -3.6468e-01, -1.5081e-01, -2.3761e-02, 2.7475e-01, 5.3707e-01,\n",
548
+ " 9.3503e-02, -4.9759e-01, 1.5903e-01, -1.2017e-01, 3.4478e-01,\n",
549
+ " -2.1399e-01, 3.9456e-01, -3.2861e-01, 1.7182e-01, -1.1697e-01,\n",
550
+ " 5.6727e-03, -1.9770e-01, -2.3682e-01, 2.7554e-01, -3.9236e-01,\n",
551
+ " 2.0691e-01, 1.6439e-01, -3.7138e-01, -7.8304e-01, -1.9874e-01,\n",
552
+ " 6.4637e-01, -2.4494e-01, -4.1920e-01, -3.7675e-01, 1.3178e-01,\n",
553
+ " 1.9076e-01, -1.2906e-01, -6.4864e-04, -9.7821e-03, -1.2172e-01,\n",
554
+ " -5.5357e-02, 2.2997e-01, -3.2848e-01, -4.1649e-01, 9.9676e-04,\n",
555
+ " -4.5320e-01, -2.2864e-01, -1.6760e-01, -7.9657e-02, -6.0780e-02,\n",
556
+ " -1.7627e-01, -4.1947e-02, 2.3884e-01, -4.7784e-03, -3.1593e-01,\n",
557
+ " -1.0243e-01, 5.3464e-01, 2.7388e-01, -4.2258e-02, -1.5521e-01,\n",
558
+ " -1.0183e-01, -2.9342e-01, -1.0132e+00, 2.3122e-01, -3.3482e-01,\n",
559
+ " 3.2136e-01, -2.3603e-01, -1.4938e-01, -2.3986e-01, 6.1094e-02,\n",
560
+ " 1.6784e-01, -3.8075e-02, 5.6459e-01, -2.0828e-02, -1.7406e-01,\n",
561
+ " -2.9475e-01, -5.0143e-01, -1.6885e-01, 4.4070e-01, 3.1866e-01,\n",
562
+ " -2.7534e-01, 4.1410e-01, -7.2704e-02, -2.9659e-01, 3.0922e-01,\n",
563
+ " -5.1553e-01, -2.7293e-01, -1.2403e-01, 5.3698e-01, 8.8994e-02,\n",
564
+ " 4.1334e-01, 2.5389e-01, 6.0110e-01, -2.3192e-01, -9.9463e+00,\n",
565
+ " 3.8342e-01, -3.4833e-01, 3.5175e-02, -3.3336e-01, 2.5660e-01,\n",
566
+ " 8.5744e-01, -3.4563e-01, 3.0483e-03, 3.4735e-01, 3.8450e-01,\n",
567
+ " 3.9665e-01, 2.2100e-01, 6.5109e-02, -5.5761e-01, -6.2348e-01,\n",
568
+ " -1.8679e-01, 1.9003e-01, 7.4262e-02, -5.9655e-02, -3.9839e-01,\n",
569
+ " -2.2625e-02, -7.6319e-02, 2.9763e-01, 1.4098e-01, -2.8759e-01,\n",
570
+ " -4.0783e-01, 1.1544e-01, 3.2446e-01, -2.9828e-01, 1.4054e-02,\n",
571
+ " 1.6943e-01, -2.0345e-01, -2.1174e-02, 1.1417e-01, 3.3420e-01,\n",
572
+ " -1.0892e-01, -3.1187e-01, -5.7087e-01, -1.1561e-02, 4.2107e-02,\n",
573
+ " 4.9406e-01, -3.7056e-01, -3.2354e-01, 5.4846e-02, 2.4392e-01,\n",
574
+ " -1.2840e-01, -4.3743e-01, 2.4391e-01, 2.1046e-01, -6.3811e-01,\n",
575
+ " 3.5563e-01, -2.0561e-01, -3.0996e-01, 1.6479e-01, -5.1947e-02,\n",
576
+ " 3.2559e-01, -6.3670e-03, -2.7855e-01, -4.2847e-01, -1.2022e-01,\n",
577
+ " 4.0702e-01, 9.6086e-01, 1.3305e-01, -2.0369e-01, 7.5751e-02,\n",
578
+ " -1.2915e-01, -8.5741e-02, 2.7087e-01, 9.1068e-02, -1.5946e-01,\n",
579
+ " 4.7289e-01, 1.0613e-01, 1.3504e-01, 2.7304e-01, -7.9823e-01,\n",
580
+ " 1.1986e-01, 4.7432e-01, -1.4133e-01, 3.9729e-01, -1.6949e-01,\n",
581
+ " -9.2290e-01, -1.9302e-01, -7.9017e-02, -6.5796e-01, 1.3385e-02,\n",
582
+ " 1.6185e-01, -3.4487e-01, 5.8601e-01, -1.5023e-01, 5.8034e-01,\n",
583
+ " -2.8326e-01, -1.6494e-01, -2.9796e-01, 6.7479e-03, -6.3622e-01,\n",
584
+ " -1.7732e-02, -1.6043e-01, -8.2452e-01, -2.4934e-02, -1.3969e-01,\n",
585
+ " -1.2475e-01, 2.1235e-01, 6.9211e-02, 1.1795e-01, -2.5098e-02,\n",
586
+ " 4.8630e-01, 3.5354e-01, 4.4272e-01, 2.5360e-01, 2.7441e-01,\n",
587
+ " -2.6457e-01, -3.3007e-01, -3.1083e-01, 4.9623e-01, -2.7829e-01,\n",
588
+ " -3.0000e-01, -2.5620e-01, 2.1623e-01, -1.0724e-01, -5.0995e-01,\n",
589
+ " -4.9460e-01, 8.4283e-02, -3.2844e-01, -6.0080e-01, -1.1809e-01,\n",
590
+ " 1.1040e-01, 3.7749e-02, 3.9097e-01, 2.7157e-02, -3.5270e-01,\n",
591
+ " -1.0008e-01, -3.1026e-01, -1.9041e-01, 3.7090e-01, -4.5056e-01,\n",
592
+ " -8.3087e-02, -3.6450e-01, -1.0154e+00, -1.3134e-01, -5.0261e-02,\n",
593
+ " 3.6961e-01, -1.1989e-01, -1.2336e-01, 2.6829e-01, -6.0926e-01,\n",
594
+ " -3.0037e-01, -1.0460e+00, -2.1501e-01, 1.7171e-01, 1.7970e-02,\n",
595
+ " -2.0708e-01, -1.3656e-01, -3.2854e-01, 1.2158e-01, -3.0438e-01,\n",
596
+ " -4.6487e-02, 1.8717e-01, -2.3236e-01, -1.4668e-01, -6.9169e-01,\n",
597
+ " -2.1502e-01, -1.2722e-01, 3.5600e-01, 1.5203e-03, -3.7041e-01,\n",
598
+ " -6.5877e-01, 2.1490e-01, -5.1359e-02, 2.2720e-01, -1.6363e-01,\n",
599
+ " -1.0862e-01, 1.4914e-02, 3.7205e-01, 3.4950e-01, -2.5987e-01,\n",
600
+ " -2.0222e-01, 3.4466e-02, 5.8733e-01, -1.6877e-01, -4.8642e-01,\n",
601
+ " -7.8254e-03, 1.2950e-01, -5.6791e-01, -6.6342e-01, -1.5021e-01,\n",
602
+ " -4.4367e-01, -2.8434e-01, -1.7593e-01, -4.2538e-01, -3.7350e-01,\n",
603
+ " -4.0185e-02, -6.1727e-01, 2.3771e-01, -4.1247e-01, 3.9440e-01,\n",
604
+ " 1.0506e-01, -4.0222e-01, 5.9232e-01])\n",
605
+ "TF-IDF Keywords: [('rus', np.float64(0.33567254331867563)), ('ahecaqo', np.float64(0.25175440748900674)), ('000', np.float64(0.16783627165933782)), ('1777', np.float64(0.16783627165933782)), ('ile', np.float64(0.16783627165933782)), ('pşıqo', np.float64(0.16783627165933782)), ('türkçe', np.float64(0.16783627165933782)), ('vardı', np.float64(0.16783627165933782)), ('çerkes', np.float64(0.16783627165933782)), ('çerkesya', np.float64(0.16783627165933782)), ('12', np.float64(0.08391813582966891)), ('1837', np.float64(0.08391813582966891)), ('1838', np.float64(0.08391813582966891)), ('adamıydı', np.float64(0.08391813582966891)), ('adlardaki', np.float64(0.08391813582966891)), ('anlamlıdır', np.float64(0.08391813582966891)), ('anlamına', np.float64(0.08391813582966891)), ('askeri', np.float64(0.08391813582966891)), ('askerî', np.float64(0.08391813582966891)), ('atlıdan', np.float64(0.08391813582966891)), ('atlıya', np.float64(0.08391813582966891)), ('az', np.float64(0.08391813582966891)), ('becerisinin', np.float64(0.08391813582966891)), ('belirtir', np.float64(0.08391813582966891)), ('beyoğlu', np.float64(0.08391813582966891)), ('bilgiler', np.float64(0.08391813582966891)), ('birliklerine', np.float64(0.08391813582966891)), ('biyografi', np.float64(0.08391813582966891)), ('bjeduğ', np.float64(0.08391813582966891)), ('bölgesinde', np.float64(0.08391813582966891)), ('büyütüldü', np.float64(0.08391813582966891)), ('devlet', np.float64(0.08391813582966891)), ('doğdu', np.float64(0.08391813582966891)), ('doğumlular', np.float64(0.08391813582966891)), ('duyulan', np.float64(0.08391813582966891)), ('dönem', np.float64(0.08391813582966891)), ('ek', np.float64(0.08391813582966891)), ('ekiyle', np.float64(0.08391813582966891)), ('erken', np.float64(0.08391813582966891)), ('eğitim', np.float64(0.08391813582966891)), ('eş', np.float64(0.08391813582966891)), ('geldiği', np.float64(0.08391813582966891)), ('gelen', np.float64(0.08391813582966891)), ('geçti', np.float64(0.08391813582966891)), ('hakkında', np.float64(0.08391813582966891)), ('hastalıktan', np.float64(0.08391813582966891)), ('ismi', np.float64(0.08391813582966891)), ('ismidir', np.float64(0.08391813582966891)), ('için', np.float64(0.08391813582966891)), ('kafkas', np.float64(0.08391813582966891)), ('kalıcı', np.float64(0.08391813582966891)), ('katılımı', np.float64(0.08391813582966891)), ('kaydedilmedi', np.float64(0.08391813582966891)), ('kaynak', np.float64(0.08391813582966891)), ('kaynaklarından', np.float64(0.08391813582966891)), ('kaynakça', np.float64(0.08391813582966891)), ('kazandığı', np.float64(0.08391813582966891)), ('kişi', np.float64(0.08391813582966891)), ('kişiler', np.float64(0.08391813582966891)), ('lakapları', np.float64(0.08391813582966891)), ('lakaptır', np.float64(0.08391813582966891)), ('müfrezesi', np.float64(0.08391813582966891)), ('nda', np.float64(0.08391813582966891)), ('nun', np.float64(0.08391813582966891)), ('nın', np.float64(0.08391813582966891)), ('olduğunu', np.float64(0.08391813582966891)), ('oluşan', np.float64(0.08391813582966891)), ('ordusu', np.float64(0.08391813582966891)), ('oğlu', np.float64(0.08391813582966891)), ('pek', np.float64(0.08391813582966891)), ('qo', np.float64(0.08391813582966891)), ('savaşı', np.float64(0.08391813582966891)), ('savaşına', np.float64(0.08391813582966891)), ('saygı', np.float64(0.08391813582966891)), ('sim', np.float64(0.08391813582966891)), ('soneki', np.float64(0.08391813582966891)), ('sonra', np.float64(0.08391813582966891)), ('soy', np.float64(0.08391813582966891)), 
('soyadları', np.float64(0.08391813582966891)), ('soylular', np.float64(0.08391813582966891)), ('sıra', np.float64(0.08391813582966891)), ('sırasında', np.float64(0.08391813582966891)), ('tarafına', np.float64(0.08391813582966891)), ('tarihlerini', np.float64(0.08391813582966891)), ('ulaşıyordu', np.float64(0.08391813582966891)), ('yazmadıkları', np.float64(0.08391813582966891)), ('yıl', np.float64(0.08391813582966891)), ('zaferlerle', np.float64(0.08391813582966891)), ('çatışmalar', np.float64(0.08391813582966891)), ('çerkesler', np.float64(0.08391813582966891)), ('çerkeslerin', np.float64(0.08391813582966891)), ('öldü', np.float64(0.08391813582966891)), ('ölenler', np.float64(0.08391813582966891)), ('ölüm', np.float64(0.08391813582966891)), ('ünlüydü', np.float64(0.08391813582966891))]\n",
606
+ "BERT Embeddings:\n",
607
+ "Text 1 embedding shape: torch.Size([233, 768])\n"
608
+ ]
609
+ }
610
+ ],
611
+ "source": [
612
+ "#-------------------------tf-ıdf hesaplama\n",
613
+ "import re\n",
614
+ "import numpy as np\n",
615
+ "import pandas as pd\n",
616
+ "from nltk.stem import WordNetLemmatizer\n",
617
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
618
+ "from nltk.corpus import stopwords as nltk_stopwords\n",
619
+ "from transformers import BertTokenizer, BertModel\n",
620
+ "import torch\n",
621
+ "\n",
622
+ "# BERT Tokenizer ve Model'i yükleyin\n",
623
+ "tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
624
+ "model = BertModel.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
625
+ "\n",
626
+ "\n",
627
+ "#-------------------------- burada turkish_stop_words'ü alıyoruz\n",
628
+ "def load_stop_words(file_path):\n",
629
+ " \"\"\"Stop words'leri dosyadan okuyarak bir liste oluşturur.\"\"\"\n",
630
+ " with open(file_path, 'r', encoding='utf-8') as file:\n",
631
+ " stop_words = [line.strip() for line in file if line.strip()]\n",
632
+ " return stop_words\n",
633
+ "\n",
634
+ "# Türkçe stop words dosyasını yükleyin\n",
635
+ "stop_words_list = load_stop_words('turkish_stop_words.txt')\n",
636
+ "\n",
637
+ "#gömülen kelimeleri k-means ile kümeleyebiliriz , benzerlik oranını hesaplamak için farklı algoritmalardan yararlanabiliriz.\n",
638
+ "def get_bert_embeddings(text):\n",
639
+ " inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)\n",
640
+ " with torch.no_grad():\n",
641
+ " outputs = model(**inputs)\n",
642
+ " # Son katmandaki gömme (embedding) çıktısını alın\n",
643
+ " return inputs['input_ids'],outputs.last_hidden_state\n",
644
+ "\n",
645
+ "#--------------------------- textleri tokenize eden fonksiyon \n",
646
+ "def get_token_embeddings(text):\n",
647
+ " inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)\n",
648
+ " with torch.no_grad():\n",
649
+ " outputs = model(**inputs)\n",
650
+ " embeddings = outputs.last_hidden_state\n",
651
+ " return embeddings\n",
652
+ "\n",
653
+ "#------------------------------------ token verilerinin ortalaması (eşik değer için)\n",
654
+ "def average_embeddings(embeddings):\n",
655
+ " # Token vektörlerinin ortalamasını alarak metin düzeyinde özet oluştur\n",
656
+ " return torch.mean(embeddings, dim=1).squeeze()\n",
657
+ "\n",
658
+ "#keywordsler çıkarmak için kullanacağım fonksiyon \n",
659
+ "def extract_keywords_tfidf(corpus,stop_words_list):\n",
660
+ " \"\"\"TF-IDF ile anahtar kelimeleri çıkarır, stop words listesi ile birlikte kullanır.\"\"\"\n",
661
+ " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
662
+ " X = vectorizer.fit_transform(corpus)\n",
663
+ " feature_names = vectorizer.get_feature_names_out()\n",
664
+ " scores = np.asarray(X.sum(axis=0)).flatten()\n",
665
+ " keywords = {feature_names[i]: scores[i] for i in range(len(feature_names))}\n",
666
+ " sorted_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)\n",
667
+ " return sorted_keywords\n",
668
+ "\n",
669
+ "#tokenleri kelimelere dönüştürür ve listeler \n",
670
+ "def decode_tokens(input_ids):\n",
671
+ " # Token ID'lerini kelimelere dönüştür\n",
672
+ " tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())\n",
673
+ " return tokens\n",
674
+ "\n",
675
+ "# Örnek metinler (buranın yerine combined_text kullanılacak)\n",
676
+ "texts = [\"\"\"Biyografi\n",
677
+ "İsim \n",
678
+ "Pşıqo ismi Ahecaqo soy ismidir. Çerkeslerin \"-qo\" soneki ile biten hem soyadları hem de lakapları vardı. Bu ek Türkçe adlardaki \"-oğlu\" ekiyle eş anlamlıdır. Pşıqo Türkçe \"Beyoğlu\" anlamına gelen bir lakaptır.\n",
679
+ "\n",
680
+ "Erken dönem \n",
681
+ "Çerkesler tarihlerini yazmadıkları ve tüm bilgiler Rus kaynaklarından geldiği için Ahecaqo hakkında pek bir şey kaydedilmedi. 1777'de Çerkesya'nın Bjeduğ bölgesinde doğdu. Askerî eğitim ile büyütüldü.\n",
682
+ "\n",
683
+ "Rus-Çerkes Savaşına Katılımı \n",
684
+ "Birkaç kaynak, Ahecaqo'nun tüm Çerkesya'da saygı duyulan bir kişi olduğunu belirtir. En az 6.000 atlıdan oluşan kalıcı bir ordusu vardı ve çatışmalar sırasında müfrezesi 12.000 atlıya ulaşıyordu. Rus birliklerine karşı kazandığı zaferlerle ünlüydü. Askeri becerisinin yanı sıra yetenekli bir devlet adamıydı.\n",
685
+ "\n",
686
+ "Ölüm \n",
687
+ "1837 yılında Rus tarafına geçti ve bir yıl sonra hastalıktan öldü.\n",
688
+ "\n",
689
+ "Kaynakça \n",
690
+ "\n",
691
+ "Çerkes soylular\n",
692
+ "1777 doğumlular\n",
693
+ "1838 yılında ölenler\n",
694
+ "Kafkas Savaşı'nda kişiler \"\"\"]\n",
695
+ " \n",
696
+ " \n",
697
+ "\n",
698
+ " \n",
699
+ "\n",
700
+ "\n",
701
+ "#token ıd leri ve bert gömme vektörleri\n",
702
+ "for text in texts:\n",
703
+ " input_ids,embeddings= get_bert_embeddings(text)\n",
704
+ " \n",
705
+ " # BERT gömme vektörlerini elde et\n",
706
+ " #embeddings = [get_bert_embeddings(text) for text in texts]\n",
707
+ "\n",
708
+ " # Tokenları ve ortalama vektörleri al\n",
709
+ " tokens = decode_tokens(input_ids)\n",
710
+ " avg_embedding = average_embeddings(embeddings)\n",
711
+ " print(f\"Tokens: {tokens}\")\n",
712
+ " print(f\"Average Embedding Shape: {avg_embedding.shape}\")\n",
713
+ " print(f\"Average Embedding: {avg_embedding}\")\n",
714
+ "\n",
715
+ "# TF-IDF anahtar kelimelerini çıkar\n",
716
+ "keywords = extract_keywords_tfidf(texts,stop_words_list)\n",
717
+ "print(\"TF-IDF Keywords:\", keywords)\n",
718
+ "\n",
719
+ "# Gösterim\n",
720
+ "print(\"BERT Embeddings:\")\n",
721
+ "for i, emb in enumerate(embeddings):\n",
722
+ " print(f\"Text {i+1} embedding shape: {emb.shape}\")\n",
723
+ "\n",
724
+ "\n",
725
+ "\n"
726
+ ]
727
+ },
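A comment near the top of the cell above notes that the embedded words could be clustered with k-means and that different algorithms could score similarity. A minimal sketch of both ideas, reusing the get_token_embeddings and average_embeddings helpers defined in that cell; the two sample sentences and n_clusters=2 are placeholder assumptions, not part of the commit:

import torch
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical inputs; in the notebook these would come from combined_text.
sample_texts = ["Örnek bir cümle.", "Bir başka örnek cümle."]

# Mean-pooled BERT vector per text (helpers from the cell above).
vecs = torch.stack([average_embeddings(get_token_embeddings(t)) for t in sample_texts]).numpy()

# Similarity between the two text vectors; cosine is one choice among many.
print(cosine_similarity(vecs[0:1], vecs[1:2]))

# k-means over the pooled vectors; the cluster count here is arbitrary.
print(KMeans(n_clusters=2, n_init=10).fit_predict(vecs))

Mean pooling keeps the sketch simple; the per-token embeddings could be clustered directly instead if word-level groupings are wanted.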
728
+ {
729
+ "cell_type": "code",
730
+ "execution_count": 8,
731
+ "metadata": {},
732
+ "outputs": [
733
+ {
734
+ "name": "stdout",
735
+ "output_type": "stream",
736
+ "text": [
737
+ "Keywords without stop words:\n",
738
+ "[('bir', np.float64(0.5)), ('bu', np.float64(0.5)), ('cümledir', np.float64(0.5)), ('örnek', np.float64(0.5)), ('anahtar', np.float64(0.3779644730092272)), ('kelimeleri', np.float64(0.3779644730092272)), ('kullanarak', np.float64(0.3779644730092272)), ('stop', np.float64(0.3779644730092272)), ('türkçe', np.float64(0.3779644730092272)), ('words', np.float64(0.3779644730092272)), ('çıkarıyoruz', np.float64(0.3779644730092272))]\n",
739
+ "\n",
740
+ "Keywords with stop words:\n",
741
+ "[('cümledir', np.float64(0.7071067811865476)), ('örnek', np.float64(0.7071067811865476)), ('anahtar', np.float64(0.3779644730092272)), ('kelimeleri', np.float64(0.3779644730092272)), ('kullanarak', np.float64(0.3779644730092272)), ('stop', np.float64(0.3779644730092272)), ('türkçe', np.float64(0.3779644730092272)), ('words', np.float64(0.3779644730092272)), ('çıkarıyoruz', np.float64(0.3779644730092272))]\n",
742
+ "\n",
743
+ "Keywords removed by stop words list:\n",
744
+ "{'bu', 'bir'}\n",
745
+ "\n",
746
+ "New keywords added by stop words list:\n",
747
+ "set()\n"
748
+ ]
749
+ }
750
+ ],
751
+ "source": [
752
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
753
+ "\n",
754
+ "def test_stop_words_effectiveness(corpus, stop_words_list):\n",
755
+ " \"\"\"Stop words listesinin etkisini test eder.\"\"\"\n",
756
+ " # İlk olarak, stop words olmadan TF-IDF hesaplayın\n",
757
+ " vectorizer_no_stop_words = TfidfVectorizer()\n",
758
+ " X_no_stop_words = vectorizer_no_stop_words.fit_transform(corpus)\n",
759
+ " feature_names_no_stop_words = vectorizer_no_stop_words.get_feature_names_out()\n",
760
+ " scores_no_stop_words = np.asarray(X_no_stop_words.sum(axis=0)).flatten()\n",
761
+ " keywords_no_stop_words = {feature_names_no_stop_words[i]: scores_no_stop_words[i] for i in range(len(feature_names_no_stop_words))}\n",
762
+ " sorted_keywords_no_stop_words = sorted(keywords_no_stop_words.items(), key=lambda x: x[1], reverse=True)\n",
763
+ "\n",
764
+ " # Şimdi, stop words ile TF-IDF hesaplayın\n",
765
+ " vectorizer_with_stop_words = TfidfVectorizer(stop_words=stop_words_list)\n",
766
+ " X_with_stop_words = vectorizer_with_stop_words.fit_transform(corpus)\n",
767
+ " feature_names_with_stop_words = vectorizer_with_stop_words.get_feature_names_out()\n",
768
+ " scores_with_stop_words = np.asarray(X_with_stop_words.sum(axis=0)).flatten()\n",
769
+ " keywords_with_stop_words = {feature_names_with_stop_words[i]: scores_with_stop_words[i] for i in range(len(feature_names_with_stop_words))}\n",
770
+ " sorted_keywords_with_stop_words = sorted(keywords_with_stop_words.items(), key=lambda x: x[1], reverse=True)\n",
771
+ " \n",
772
+ " # Stop words listesi etkisini gözlemleyin\n",
773
+ " print(\"Keywords without stop words:\")\n",
774
+ " print(sorted_keywords_no_stop_words)\n",
775
+ " \n",
776
+ " print(\"\\nKeywords with stop words:\")\n",
777
+ " print(sorted_keywords_with_stop_words)\n",
778
+ " \n",
779
+ " # Farklılıkları göster\n",
780
+ " all_keywords_no_stop_words = set([kw[0] for kw in sorted_keywords_no_stop_words])\n",
781
+ " all_keywords_with_stop_words = set([kw[0] for kw in sorted_keywords_with_stop_words])\n",
782
+ " \n",
783
+ " print(\"\\nKeywords removed by stop words list:\")\n",
784
+ " print(all_keywords_no_stop_words - all_keywords_with_stop_words)\n",
785
+ " \n",
786
+ " print(\"\\nNew keywords added by stop words list:\")\n",
787
+ " print(all_keywords_with_stop_words - all_keywords_no_stop_words)\n",
788
+ "\n",
789
+ "# Test verisi ve stop words listesi kullanarak test edin\n",
790
+ "test_stop_words_effectiveness(texts, stop_words_list)\n"
791
+ ]
792
+ },
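The cell above compares keyword lists with and without the hand-maintained turkish_stop_words.txt. As an aside, NLTK's stopwords corpus also ships a Turkish list, so the same test could be fed from there; a sketch, assuming the corpus download succeeds:

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')                  # one-time fetch of the NLTK stopwords corpus
turkish_stops = stopwords.words('turkish')  # Turkish is among the bundled languages
test_stop_words_effectiveness(texts, turkish_stops)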
793
  {
794
  "cell_type": "code",
795
  "execution_count": 20,
 
@@ -484,6 +837,7 @@
837
  "from tqdm.auto import tqdm, trange\n",
838
  "import tensorflow as tf\n",
839
  "import nltk\n",
840
+ "import re \n",
841
  "from nltk.stem import WordNetLemmatizer\n",
842
  "from nltk.corpus import stopwords\n",
843
  "\n",
 
@@ -513,16 +867,32 @@
867
  " document_count = len(combined_text)\n",
868
  " return combined_text, document_count\n",
869
  " \n",
870
+ "\n",
871
+ " \n",
872
+ " nltk.download('turkish_stop_words')\n",
873
+ " data_without_stopwords = []\n",
874
+ " for i in range(0, len(response)):\n",
875
+ " doc = re.sub('[^a-zA-Z]', ' ', response[i])\n",
876
+ " doc = doc.lower()\n",
877
+ " doc = doc.split()\n",
878
+ " doc = [lemmatizer.lemmatize(word) for word in doc if not word in set(stopwords)]\n",
879
+ " doc = ' '.join(doc)\n",
880
+ " data_without_stopwords.append(doc)\n",
881
+ "\n",
882
+ " #print ilk satır orjinal datasetteki\n",
883
+ " print(data.response[0])\n",
884
+ "\n",
885
  " # Calculate TF-IDF and get feature names\n",
886
  " @staticmethod\n",
887
  " def calculate_tfidf(documents, stop_words):\n",
888
  " vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000,min_df=2)\n",
890
  " tfidf_matrix = vectorizer.fit_transform(documents)\n",
891
  " feature_names = vectorizer.get_feature_names_out()\n",
892
  " return tfidf_matrix, feature_names\n",
893
  "\n",
894
  " # Extract keywords using TF-IDF\n",
525
- " def extract_keywords(tfidf_matrix, feature_names, top_n=10, stop_words=[]):\n",
895
+ " def extract_keywords(tfidf_matrix, feature_names, top_n=10, stop_words=()):\n",
896
  " keywords = {}\n",
897
  " for doc_idx, row in enumerate(tfidf_matrix):\n",
898
  " filtered_feature_names = [name for name in feature_names if name.lower() not in stop_words]\n",