|
#!/bin/bash |
|
|
|
# Print the usage text and terminate the script.
# Globals:   $0 (read) - script name shown in the synopsis line
# Arguments: none
# Outputs:   synopsis plus a table mapping n (essays downloaded) to the
#            token count of the resulting pg.txt, written to stdout
# Exits:     always exits with status 1
function usage {
    # The synopsis is "<script> <n>"; the placeholder used to be printed
    # in front of the script name with no separating space.
    printf '%s\n' \
        "usage: $0 <n>" \
        "note: n is the number of essays to download" \
        "for specific n, the resulting pg.txt file will have the following number of tokens:" \
        "n | tokens" \
        "--- | ---" \
        "1 | 6230" \
        "2 | 23619" \
        "5 | 25859" \
        "10 | 36888" \
        "15 | 50188" \
        "20 | 59094" \
        "25 | 88764" \
        "30 | 103121" \
        "32 | 108338" \
        "35 | 113403" \
        "40 | 127699" \
        "45 | 135896"
    exit 1
}
|
|
|
# Abort the script unless the command named by $1 can be invoked.
# Arguments: $1 - name of the command to look up
# Outputs:   on failure, "error: <name> is not available" on stderr
# Exits:     status 1 when the command cannot be found; returns normally
#            otherwise
function has_cmd {
    # Rely on the exit status of `command -v` rather than testing its output
    # with -x: for builtins and functions the output is a bare name, not a
    # path, so the -x test rejected them. Quoting "$1" also prevents word
    # splitting and globbing of the argument.
    if ! command -v "$1" >/dev/null 2>&1; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main script: download the first <n> essays listed in the RSS feed, convert
# each one to plain text, and concatenate them into pg.txt.
#
# Files written in the current directory:
#   pg-NNN-one.txt  text of essay number NNN only
#   pg-NNN-all.txt  snapshot of pg.txt after essay NNN was appended
#   pg.txt          all downloaded essays concatenated
# ---------------------------------------------------------------------------

# Check up front every tool the pipelines below depend on.
has_cmd curl
has_cmd html2text
has_cmd grep
has_cmd head
has_cmd tail
has_cmd sed
has_cmd fmt

if [ $# -ne 1 ]; then
    usage
fi

n=$1

# n is handed to `head -n`, so it must be a positive decimal integer.
case "$n" in
    '' | *[!0-9]* | 0) usage ;;
esac

feed_url="http://www.aaronsw.com/2002/feeds/pgessays.rss"

# Fetch the feed on its own so a failed download is detected instead of
# silently producing an empty URL list. --fail makes curl return non-zero on
# HTTP errors; -L follows redirects.
if ! feed=$(curl -fL "$feed_url"); then
    echo "error: failed to download $feed_url" >&2
    exit 1
fi

# Keep the lines mentioning "html", trim each down to the bare URL, and take
# the first n of them.
urls="$(printf '%s\n' "$feed" \
    | grep html \
    | sed -e "s/.*http/http/" -e "s/html.*/html/" \
    | head -n "$n")"

if [ -z "$urls" ]; then
    echo "error: no essay URLs found in $feed_url" >&2
    exit 1
fi

printf "urls:\n%s\n" "$urls"

# Start from a clean aggregate file.
rm -f -- pg.txt

# From here on a pipeline fails if any stage fails, so a broken download or
# conversion is noticed. This is enabled only after the URL extraction above,
# where `head` closing the pipe early would otherwise count as a failure.
set -o pipefail

c=1
while IFS= read -r url; do
    [ -n "$url" ] || continue

    echo "processing $url"

    printf -v cc '%03d' "$c"
    one_file="pg-$cc-one.txt"

    # Download, convert to text, drop the first three lines, strip leading
    # whitespace and re-wrap at 80 columns. The per-essay file is truncated
    # (">"), not appended to, so re-running the script does not duplicate
    # essay text.
    if ! curl -fL "$url" \
        | html2text \
        | tail -n +4 \
        | sed -E "s/^[[:space:]]+//g" \
        | fmt -w 80 > "$one_file"; then
        echo "error: failed to download or convert $url" >&2
        exit 1
    fi

    cat "$one_file" >> pg.txt

    # Snapshot of everything gathered so far.
    cp -v pg.txt "pg-$cc-all.txt"
    c=$((c + 1))

    # Pause between requests.
    sleep 1
done <<<"$urls"

echo "done. data in pg.txt"

exit 0
|
|