High Variance

Choosing the Right Tool for the Job

Every new programming project involves choosing a language early in the process. This choice can be exciting and daunting at the same time, as it always involves weighing several conflicting factors. Based on many years of varied programming experience, here is my advice:

  1. Pick a language appropriate to the task. Do you need good numerical optimization libraries? Do you need matrix operations? Do you need to parse HTML files? You don’t need the best language, but you don’t want to be stuck with something too slow or that requires you to write a lot of unnecessary support code. This is what John D. Cook is getting at when he says “Don’t be a technical masochist”. I always think of this first even though it’s not always the most important criterion.

  2. Take into account the fact that you might have to learn (or relearn) the language. This is the dreaded learning curve and even though I’m a quick learner, I always underestimate the time it takes to achieve proficiency.

  3. Think about the future utility of knowing the language. This is harder than it looks. On the one hand Python is broadly popular and winning in scientific computing. Ruby is a similar language but has better libraries for web development. Both seem like good investments, but I haven’t used Python in more than 5 years and I’ve never written anything in Ruby.

  4. If you are collaborating, think about what the rest of the team knows. Even if your partner isn’t writing as much code as you, it’s pretty valuable if they can read your code.

  5. Don’t forget to have fun. Programming can be a grind–sometimes it’s fun to stretch your brain with something new.

  6. Expertise in a language should not be squandered. It’s a lot of work pouring the core of a language (and its standard libraries) into your head, but once you do it, you can write a lot of good code very quickly. This knowledge depreciates quickly when you switch to different “better” tools for a given job.

I spend a fair amount of my time analyzing data with Stata, and that involves writing programs in a quirky proprietary language. For example, it has no variables, but it does have local and global macros. To evaluate a local macro, you surround its name with a back quote and a forward quote (`x'). These can be nested. Global macros are prefixed with a dollar sign ($x). That said, it has general file handling and regular expression functions, and you can do interesting programming with it.

Lately, I’ve been trying to maintain and build my expertise in Stata by using it for a broad range of programming tasks even when it’s not the objectively best tool for a job. That was the case a few months ago when I parsed up and analyzed the content on this site. The end result looks a little weird, but it works and my brain is in a better place for it:

(analyze.do) download
/*
* Analyze articles published on High Variance in 2012 and 2013
*
* BTW: UCLA has way better documentation of file read than Stata:
*      http://www.ats.ucla.edu/stat/stata/faq/fileread.htm
*
*/

* Session setup: never pause for --more--, and start a fresh log for this run.
set more off
capture log close
log using analyze.log, replace

* Directory that receives the exported graphs.
global DEST "../source/images/analyze-2014-02-09"

* Make the do-file re-runnable in the same session: forget the program
* definitions from a previous run and close any file handles left open.
capture program drop parse_post
capture program drop load_year_posts
file close _all

/* Given the name of a Markdown document, parse it and return its
   title, date (of publication), tags, and wordcount.

   Usage:
     parse_post "path/to/post.markdown"

   The first line of the file is skipped (assumed to be the "---" that
   opens the YAML header). Following lines are treated as "key: value"
   header lines until a line consisting of "---" is read; every line after
   that is counted as body text.

   Returns in r():
     r(title)      local   value of the "title" header ("" if absent)
     r(tags)       local   value of the "categories" header ("" if absent)
     r(date)       scalar  date("<value>", "YMD") of the "date" header
                           (missing if absent or unparsable)
     r(wordcount)  scalar  number of whitespace-separated words in the body
*/
program define parse_post ,rclass
    local mdfile `"`1'"'

    tempname fh
    * Quote the file name so that paths containing spaces can be opened.
    file open `fh' using `"`mdfile'"' ,read

    local donewithyaml = 0
    local wordcount = 0
    * Defaults keep the return statements well-formed when a header is absent
    * (an undefined `date' would turn "return scalar date = " into a syntax error).
    local title ""
    local tags ""
    local date = .

    /* Assume first line is just "---" marking beginning of YAML header */
    file read `fh' line
    file read `fh' line
    while r(eof)==0 {
      * macval() stops Stata from expanding backticks and dollar signs that
      * occur in the Markdown text as if they were macro references.
      if (!`donewithyaml') { // Parse header
        if (`"`macval(line)'"'=="---") {
          local donewithyaml = 1
        }
        else if regexm(`"`macval(line)'"',`"([a-zA-Z_-]+): ["]*([^"]*)["]*"') {
          * Key: letters, underscore or hyphen. Value: text after ": ",
          * with optional surrounding double quotes stripped.
          local key = regexs(1)
          local value = regexs(2)
          if ("`key'"=="date") {
            local date = date("`value'", "YMD")
          }
          if ("`key'"=="categories") {
            local tags = "`value'"
          }
          if ("`key'"=="title") {
            local title = "`value'"
          }
        }
      }
      else { // Count words in body of post
        local wcl = wordcount(`"`macval(line)'"')
        local wordcount = `wordcount' + `wcl'
      }
      file read `fh' line
    }
    file close `fh'

    return local title = "`title'"
    return scalar date = `date'
    return local tags = "`tags'"
    return scalar wordcount = `wordcount'

end

/* Given a year, load in the stats (using parse_post) for each article.
   The resulting data set is one observation per post. It also creates dummy
   variables for each referenced tag.

   Usage:
     clear
     load_year_posts 2013

   The data in memory must be empty on entry (the variables below are
   created with -generate-). Posts are found by listing
   ~/octopress/source/_posts/<year>-* through the shell; the listing is
   written to /tmp/<year>-filelist.txt.

   Variables created:
     title, tags   strings returned by parse_post
     wordcount     body word count returned by parse_post
     postdate      publication date returned by parse_post
     tag_<name>    1 if the post carries the tag, 0 otherwise
                   ("-" in a tag name becomes "_")
     year          the year passed in
*/
program define load_year_posts
    local year "`1'"
    ! ls ~/octopress/source/_posts/`year'-* > /tmp/`year'-filelist.txt

    tempname fh
    file open `fh' using "/tmp/`year'-filelist.txt" ,read

    * Create the variables on an empty dataset; one observation is added per
    * listed file, so an empty listing yields zero observations rather than a
    * single blank one.
    qui gen title=""
    qui gen tags=""
    qui gen wordcount=.
    qui gen postdate=.

    local i 1
    file read `fh' line
    while r(eof)==0 {
      qui set obs `i'
      parse_post `"`line'"'

      * Snapshot parse_post's results right away so that later commands
      * cannot disturb r() before the values are used.
      local ptitle `"`r(title)'"'
      local ptags  `"`r(tags)'"'
      local pdate = r(date)
      local pwc   = r(wordcount)

      qui replace title = `"`ptitle'"' in `i'
      qui replace tags = `"`ptags'"' in `i'
      qui replace postdate = `pdate' in `i'
      qui replace wordcount = `pwc' in `i'

      local ntags = wordcount(`"`ptags'"')
      forval j = 1/`ntags' {
        local tag = subinstr(word(`"`ptags'"',`j'),"-","_",.)
        * "exact" matters: without it a tag that is a prefix of an existing
        * tag variable (tag_photo vs tag_photos) is taken for an abbreviation
        * of it, and the wrong dummy would be updated.
        capture confirm variable tag_`tag' ,exact
        if _rc != 0 {
          qui gen tag_`tag'=.
        }
        qui replace tag_`tag'=1 in `i'
      }
      local i = `i' + 1
      file read `fh' line
    }
    file close `fh'

    * Turn the "not tagged" missings into 0. Skipped when no post had any tag,
    * in which case no tag_* variable exists and the varlist would not resolve.
    capture unab tagvars : tag_*
    if _rc == 0 {
      foreach tagvar of local tagvars {
        qui replace `tagvar'=0 if `tagvar'==.
      }
    }
    gen year=`year'
end

/* Load up 2012 and 2013 */

clear
load_year_posts 2012
tempfile y2012
save `y2012'
clear
load_year_posts 2013
append using `y2012'

/* A tag used in only one of the two years has no dummy variable in the other
   year's data, so -append- leaves it missing for that year's posts. Recode
   those to 0; otherwise the "tag_x==0" test below is false for such posts and
   an untagged article can slip through unnoticed. */
foreach v of varlist tag_* {
  qui replace `v' = 0 if missing(`v')
}

local tags "tech politics thanks education kids economics music productivity conan kbr photo mundane christmas blog"

/* Are there any articles that do not have one of the above tags? */

/* Build three pieces in one pass over the tag list:
     tagvars      the tag_* variables, in list order (used for the bar graph)
     ifclause     "if 1 & tag_a==0 & tag_b==0 ..." selecting untagged posts
     yvaroptions  yvaroptions(relabel(1 "a" 2 "b" ...)) for the bar labels
   The macros are assigned in copy form (no "=") so the growing strings are
   never passed through the expression evaluator, which truncates long
   strings in older Stata releases. */
local tagvars ""
local ifclause "if 1 "
local yvaroptions "yvaroptions(relabel("
local i=1
foreach t in `tags' {
  local tagvars `"`tagvars' tag_`t'"'
  local ifclause `"`ifclause' & tag_`t'==0"'
  local yvaroptions `"`yvaroptions'`i' "`t'" "'
  local i = `i' + 1
}
local yvaroptions `"`yvaroptions'))"'
count `ifclause'
list title tags `ifclause'

di `"`yvaroptions'"'

/* Draw lots of pretty pictures.
*  Thanks Michael Mitchell for your awesome _Visual Guide to Stata Graphics_ Third Edition!
*/

* Horizontal bars: number of articles carrying each tag, split by year.
graph hbar (sum) `tagvars', ascategory over(year) `yvaroptions' ///
      b1title("Article Count by tag")
graph export "${DEST}/tags.png", replace

* Overlaid word-count histograms, one per year (100-word bins from 0).
* Thanks for the overlaid histogram ATS!
* http://www.ats.ucla.edu/stat/stata/faq/histogram_overlay.htm
twoway (histogram wordcount if year==2012,          ///
            frequency start(0) width(100)           ///
            color(navy))                            ///
       (histogram wordcount if year==2013,          ///
            frequency start(0) width(100)           ///
            fcolor(none) lcolor(black)),            ///
       xtitle("") ytitle("Word Count Frequency")    ///
       legend(label(1 "2012") label(2 "2013"))
graph export "${DEST}/wchist.png", replace

* Detailed word-count statistics for each year.
by year, sort: summarize wordcount, detail

* Monthly totals: ac is a per-article counter, so its sum is the article count.
gen month = month(postdate)
gen ac = 1

graph bar (sum) ac, over(year) over(month) asyvars ///
      b1title("Month") ytitle("Article Count")
graph export "${DEST}/ac.png", replace

graph bar (sum) wordcount, over(year) over(month) asyvars ///
      b1title("Month") ytitle("Word Count")
graph export "${DEST}/wc.png", replace

* Yearly totals of articles and words.
collapse (sum) ac wordcount, by(year)
list

log close

Comments