## this example shows how to compare one vector of data to another. You may want
## to make lots of these comparisons and in specific ways (e.g., comparing
## speakers within a conversation) - this becomes a specific data management
## problem (how to efficiently perform operations over a particular dataset).
## For more on making these comparisons more complete, see
## miserman.github.io/lingmatch/#comparisons


# Canberra similarity, calculated element by element: 1 minus the absolute
# difference of each pair of values divided by their sum. The .0001 added to
# the sum keeps the denominator from being 0 when both values are 0.
canberra <- function(a, b) {
  gap <- abs(a - b)
  total <- a + b + .0001
  1 - gap / total
}

# these are the standard LSM LIWC categories:
cats <- c('ppron', 'ipron', 'article', 'adverb', 'conj', 'prep', 'auxverb', 'negate', 'quant')

# say you had a data object with these LIWC categories in columns, for example
# (here, two rows of random values):
data <- matrix(
  rpois(length(cats) * 2, 20) / 10,
  nrow = 2, ncol = length(cats), dimnames = list(NULL, cats)
)

# you could compare rows using the canberra function. as.numeric is applied to
# each row before the comparison so that values stored as text (or a row pulled
# from a data frame) become a plain numeric vector; arithmetic on text would
# otherwise stop with an error before any score is calculated:
lsm <- canberra(as.numeric(data[1, cats]), as.numeric(data[2, cats]))

# as.numeric drops names, so label each score with its category again:
names(lsm) <- cats

# then, if you want a single LSM score, you can average across categories:
lsm <- mean(lsm)


# alternatively, both sources might sit in the same row, with the second
# source's columns marked by a "_2" suffix on the category name:
cats <- c(cats, paste0(cats, '_2'))
data <- matrix(
  rpois(2 * length(cats), 20) / 10,
  nrow = 2, ncol = length(cats), dimnames = list(NULL, cats)
)

# in that case, compare the columns named by entries 1 to 9 of cats against
# those named by entries 10 to 18; every row is handled at once:
lsm <- canberra(data[, cats[seq_len(9)]], data[, cats[9 + seq_len(9)]])

# each row now holds one score per category; rowMeans averages them into a
# single LSM score per row:
lsm <- rowMeans(lsm)


## finally, a more explicit version that may be more usable for people who use
## R for statistics but are not familiar with programming!
# data should be set up in a "wide" format, so that every row represents a dyad
# partners in each dyad should have LIWC results organized side-by-side
# LIWC categories should be labeled to indicate which person the results belong
# to; for example, "_w" for wife (the code below reads columns ending in "_h"
# and "_w")

# calculate each category-level LSM score:
# 1 - |difference| / (sum + .0001), one new column per category
data$lsm_ppron_hw <- 1 - abs(data$ppron_h - data$ppron_w) /
  (data$ppron_h + data$ppron_w + .0001)
data$lsm_ipron_hw <- 1 - abs(data$ipron_h - data$ipron_w) /
  (data$ipron_h + data$ipron_w + .0001)
data$lsm_art_hw <- 1 - abs(data$article_h - data$article_w) /
  (data$article_h + data$article_w + .0001)
data$lsm_aux_hw <- 1 - abs(data$auxverb_h - data$auxverb_w) /
  (data$auxverb_h + data$auxverb_w + .0001)
data$lsm_adv_hw <- 1 - abs(data$adverb_h - data$adverb_w) /
  (data$adverb_h + data$adverb_w + .0001)
data$lsm_conj_hw <- 1 - abs(data$conj_h - data$conj_w) /
  (data$conj_h + data$conj_w + .0001)
# note: this step reads columns named preps_h and preps_w (with an "s"), while
# the category list earlier in this file spells the category 'prep'; check
# which spelling your own data uses
data$lsm_preps_hw <- 1 - abs(data$preps_h - data$preps_w) /
  (data$preps_h + data$preps_w + .0001)
data$lsm_neg_hw <- 1 - abs(data$negate_h - data$negate_w) /
  (data$negate_h + data$negate_w + .0001)
data$lsm_quant_hw <- 1 - abs(data$quant_h - data$quant_w) /
  (data$quant_h + data$quant_w + .0001)

# average those nine LSM scores; if you opt to omit some categories, remove
# them from the sum and change the 9 to the number of categories kept
data$lsm_hw <- (
  data$lsm_ppron_hw + data$lsm_ipron_hw + data$lsm_art_hw +
    data$lsm_aux_hw + data$lsm_adv_hw + data$lsm_conj_hw +
    data$lsm_preps_hw + data$lsm_neg_hw + data$lsm_quant_hw
) / 9