Gabor Cselle
committed on
Commit
•
95ccd40
1
Parent(s):
41d52be
Train and test splitter. Clean up comments and code a bit (but not too much).
Browse files- .gitignore +1 -0
- README.md +3 -0
- arrange_train_test_images.py +38 -0
- gen_sample_data.py +5 -8
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
font_images
|
|
|
2 |
.DS_Store
|
|
|
1 |
font_images
|
2 |
+
train_test_images
|
3 |
.DS_Store
|
README.md
CHANGED
@@ -6,3 +6,6 @@ Follow along:
|
|
6 |
- [On Pebble.social](https://pebble.social/@gabor/111376050835874755)
|
7 |
- [On Threads.net](https://www.threads.net/@gaborcselle/post/CzZJpJCpxTz)
|
8 |
- [On Twitter](https://twitter.com/gabor/status/1722300841691103467)
|
|
|
|
|
|
|
|
6 |
- [On Pebble.social](https://pebble.social/@gabor/111376050835874755)
|
7 |
- [On Threads.net](https://www.threads.net/@gaborcselle/post/CzZJpJCpxTz)
|
8 |
- [On Twitter](https://twitter.com/gabor/status/1722300841691103467)
|
9 |
+
|
10 |
+
Generate sample images (note this will work only on Mac): [gen_sample_data.py]
|
11 |
+
Arrange the generated font images into train and test sets: [arrange_train_test_images.py]
|
arrange_train_test_images.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Moves the generated font images into per-font train and test folders.
# TODO(gabor): maybe we should copy these instead, so we don't have to regenerate the images every time?
import os
import shutil
import random

source_dir = './font_images'
organized_dir = './train_test_images'
train_dir = os.path.join(organized_dir, 'train')
test_dir = os.path.join(organized_dir, 'test')


def split_font_images(source_dir, train_dir, test_dir, train_fraction=0.8):
    """Move each font's .png images into train/<font>/ and test/<font>/.

    The font name is the part of the file name before the first underscore.
    Files are grouped by that exact name; a plain prefix match would let a
    font such as "Courier" swallow the images of "Courier New".

    Args:
        source_dir: Directory holding the '<font>_<suffix>.png' images.
        train_dir: Root directory that receives one sub-folder per font
            with the training images.
        test_dir: Root directory that receives one sub-folder per font
            with the test images.
        train_fraction: Share of each font's images moved to train_dir;
            the remainder goes to test_dir.

    Returns:
        A dict mapping each font name to a (train_count, test_count) tuple.
    """
    # create directories if they don't exist
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # List the source directory once and bucket the images by font name.
    files_by_font = {}
    for filename in os.listdir(source_dir):
        if filename.endswith('.png'):
            files_by_font.setdefault(filename.split('_')[0], []).append(filename)

    counts = {}
    for font, font_files in files_by_font.items():
        font_train_dir = os.path.join(train_dir, font)
        font_test_dir = os.path.join(test_dir, font)
        os.makedirs(font_train_dir, exist_ok=True)
        os.makedirs(font_test_dir, exist_ok=True)

        # Shuffle so the split is random rather than following listing order.
        random.shuffle(font_files)
        cutoff = int(train_fraction * len(font_files))

        # Moving training files
        for train_file in font_files[:cutoff]:
            shutil.move(os.path.join(source_dir, train_file), font_train_dir)

        # Moving test files
        for test_file in font_files[cutoff:]:
            shutil.move(os.path.join(source_dir, test_file), font_test_dir)

        counts[font] = (cutoff, len(font_files) - cutoff)

    return counts


if __name__ == '__main__':
    split_font_images(source_dir, train_dir, test_dir)
|
gen_sample_data.py
CHANGED
@@ -10,9 +10,6 @@ import random
|
|
10 |
# Download the necessary data from nltk
|
11 |
nltk.download('brown')
|
12 |
|
13 |
-
# Sample text for prose and code
|
14 |
-
prose_text = " ".join(brown.words(categories='news')[:50]) # First 50 words from news category
|
15 |
-
|
16 |
# Note that this will only work on MacOS where this is the default font directory
|
17 |
font_dirs = ['/System/Library/Fonts/', '/System/Library/Fonts/Supplemental/']
|
18 |
output_dir = './font_images'
|
@@ -24,17 +21,17 @@ all_brown_words = sorted(set(brown.words(categories='news')))
|
|
24 |
FONT_ALLOWLIST = ["Arial", "Avenir", "Courier", "Helvetica", "Georgia", "Tahoma", "Times New Roman", "Verdana"]
|
25 |
|
26 |
def wrap_text(text, line_length=10):
|
27 |
-
"""
|
28 |
-
Wraps the provided text every 'line_length' words.
|
29 |
-
"""
|
30 |
words = text.split()
|
31 |
return "\n".join([" ".join(words[i:i+line_length]) for i in range(0, len(words), line_length)])
|
32 |
|
33 |
-
def random_prose_text(words, num_words=200):
|
|
|
34 |
random_words = " ".join(random.sample(words, num_words))
|
35 |
return wrap_text(random_words)
|
36 |
|
37 |
-
def random_code_text(base_code, num_lines=15):
|
|
|
38 |
lines = base_code.split("\n")
|
39 |
return "\n".join(random.sample(lines, min(num_lines, len(lines))))
|
40 |
|
|
|
10 |
# Download the necessary data from nltk
|
11 |
nltk.download('brown')
|
12 |
|
|
|
|
|
|
|
13 |
# Note that this will only work on MacOS where this is the default font directory
|
14 |
font_dirs = ['/System/Library/Fonts/', '/System/Library/Fonts/Supplemental/']
|
15 |
output_dir = './font_images'
|
|
|
21 |
FONT_ALLOWLIST = ["Arial", "Avenir", "Courier", "Helvetica", "Georgia", "Tahoma", "Times New Roman", "Verdana"]
|
22 |
|
23 |
def wrap_text(text, line_length=10):
    """Break 'text' into lines of at most 'line_length' words each.

    The text is split on whitespace; words are re-joined with single spaces
    and the resulting lines are separated by newlines.
    """
    tokens = text.split()
    wrapped_lines = []
    for start in range(0, len(tokens), line_length):
        wrapped_lines.append(" ".join(tokens[start:start + line_length]))
    return "\n".join(wrapped_lines)
|
27 |
|
28 |
+
def random_prose_text(words, num_words=200):
    """Return up to 'num_words' randomly chosen words, wrapped into lines.

    Args:
        words: Sequence of candidate words; they are sampled without
            replacement.
        num_words: Maximum number of words to draw.

    Returns:
        The sampled words joined by spaces and passed through wrap_text().
    """
    # Clamp the sample size the same way random_code_text does:
    # random.sample raises ValueError when asked for more items than exist.
    sample_size = min(num_words, len(words))
    random_words = " ".join(random.sample(words, sample_size))
    return wrap_text(random_words)
|
32 |
|
33 |
+
def random_code_text(base_code, num_lines=15):
    """Pick up to 'num_lines' random lines of 'base_code', joined by newlines.

    Lines are sampled without replacement; when the code has fewer than
    'num_lines' lines, all of them are returned (in random order).
    """
    candidates = base_code.split("\n")
    sample_size = min(num_lines, len(candidates))
    chosen = random.sample(candidates, sample_size)
    return "\n".join(chosen)
|
37 |
|