Train and test splitter. Clean up comments and code a bit (but not too much).

Files changed (4) hide show

.gitignore CHANGED Viewed

@@ -1,2 +1,3 @@
 font_images
 .DS_Store

 font_images
+train_test_images
 .DS_Store

README.md CHANGED Viewed

@@ -6,3 +6,6 @@ Follow along:
 - [On Pebble.social](https://pebble.social/@gabor/111376050835874755)
 - [On Threads.net](https://www.threads.net/@gaborcselle/post/CzZJpJCpxTz)
 - [On Twitter](https://twitter.com/gabor/status/1722300841691103467)

 - [On Pebble.social](https://pebble.social/@gabor/111376050835874755)
 - [On Threads.net](https://www.threads.net/@gaborcselle/post/CzZJpJCpxTz)
 - [On Twitter](https://twitter.com/gabor/status/1722300841691103467)
+Generate sample images (note this will work only on Mac): [gen_sample_data.py]
+Arrange the generated images into train and test sets: [arrange_train_test_images.py]

arrange_train_test_images.py ADDED Viewed

+# moves the font images into train and test folders
+# TODO(gabor): maybe we should copy these instead, so we don't have to regenerate the images every time?
+import os
+import shutil
+import random
+source_dir = './font_images'
+organized_dir = './train_test_images'
+train_dir = os.path.join(organized_dir, 'train')
+test_dir = os.path.join(organized_dir, 'test')
+# create directories if they don't exist
+os.makedirs(train_dir, exist_ok=True)
+os.makedirs(test_dir, exist_ok=True)
+# group the .png files by font name (the text before the first '_') in one pass;
+# matching the exact name instead of using startswith() keeps fonts that share a
+# prefix (e.g. "Arial" and "Arial Black") from swallowing each other's files,
+# and keeps non-image files out of the split
+files_by_font = {}
+for f in os.listdir(source_dir):
+    if f.endswith('.png'):
+        files_by_font.setdefault(f.split('_')[0], []).append(f)
+fonts = sorted(files_by_font)  # unique font names, in a stable order
+for font in fonts:
+    font_train_dir = os.path.join(train_dir, font)
+    font_test_dir = os.path.join(test_dir, font)
+    os.makedirs(font_train_dir, exist_ok=True)
+    os.makedirs(font_test_dir, exist_ok=True)
+    font_files = files_by_font[font]
+    random.shuffle(font_files)
+    # 80/20 split; int() rounds down, so any remainder lands in the test set
+    split_at = int(0.8 * len(font_files))
+    train_files = font_files[:split_at]
+    test_files = font_files[split_at:]
+    # Moving training files
+    for train_file in train_files:
+        shutil.move(os.path.join(source_dir, train_file), font_train_dir)
+    # Moving test files
+    for test_file in test_files:
+        shutil.move(os.path.join(source_dir, test_file), font_test_dir)

gen_sample_data.py CHANGED Viewed

@@ -10,9 +10,6 @@ import random
 # Download the necessary data from nltk
 nltk.download('brown')
-# Sample text for prose and code
-prose_text = " ".join(brown.words(categories='news')[:50]) # First 50 words from news category
 # Note that this will only work on MacOS where this is the default font directory
 font_dirs = ['/System/Library/Fonts/', '/System/Library/Fonts/Supplemental/']
 output_dir = './font_images'
@@ -24,17 +21,17 @@ all_brown_words = sorted(set(brown.words(categories='news')))
 FONT_ALLOWLIST = ["Arial", "Avenir", "Courier", "Helvetica", "Georgia", "Tahoma", "Times New Roman", "Verdana"]
 def wrap_text(text, line_length=10):
-    """
-    Wraps the provided text every 'line_length' words.
-    """
     words = text.split()
     return "\n".join([" ".join(words[i:i+line_length]) for i in range(0, len(words), line_length)])
-def random_prose_text(words, num_words=200):  # Sample random words
     random_words = " ".join(random.sample(words, num_words))
     return wrap_text(random_words)
-def random_code_text(base_code, num_lines=15):  # Increase number of lines
     lines = base_code.split("\n")
     return "\n".join(random.sample(lines, min(num_lines, len(lines))))

 # Download the necessary data from nltk
 nltk.download('brown')
 # Note that this will only work on MacOS where this is the default font directory
 font_dirs = ['/System/Library/Fonts/', '/System/Library/Fonts/Supplemental/']
 output_dir = './font_images'
 FONT_ALLOWLIST = ["Arial", "Avenir", "Courier", "Helvetica", "Georgia", "Tahoma", "Times New Roman", "Verdana"]
 def wrap_text(text, line_length=10):
+    """Wraps the provided text every 'line_length' words."""
     words = text.split()
     return "\n".join([" ".join(words[i:i+line_length]) for i in range(0, len(words), line_length)])
+def random_prose_text(words, num_words=200):
+    """Returns a random selection of 'num_words' words from the provided list of words."""
     random_words = " ".join(random.sample(words, num_words))
     return wrap_text(random_words)
+def random_code_text(base_code, num_lines=15):
+    """Returns a random selection of 'num_lines' lines from the provided code."""
     lines = base_code.split("\n")
     return "\n".join(random.sample(lines, min(num_lines, len(lines))))