## This example shows how to compare one vector of data to another. You may
## want to make lots of these comparisons and in specific ways (e.g., comparing
## speakers within a conversation) - this becomes a specific data management
## problem (how to efficiently perform operations over a particular dataset).
## For more on making these more complete comparisons, see
## miserman.github.io/lingmatch/#comparisons

# Canberra similarity between corresponding values of `a` and `b`.
# The arithmetic is elementwise, so `a` and `b` can be two vectors or two
# matrices of the same shape; the result has that same shape. The small
# constant in the denominator avoids division by zero when both values are 0.
canberra <- function(a, b) 1 - abs(a - b) / (a + b + .0001)

# These are the standard LSM LIWC categories:
cats <- c(
  "ppron", "ipron", "article", "adverb", "conj",
  "prep", "auxverb", "negate", "quant"
)

# Say you had a data object with these LIWC categories in columns, for example
# (random values here, so the scores differ from run to run):
data <- matrix(
  rpois(length(cats) * 2, 20) / 10, 2, length(cats),
  dimnames = list(NULL, cats)
)

# You could compare rows using the canberra function. as.numeric is applied to
# each row *before* the comparison: it flattens the row to a plain vector and
# converts values stored as text, which would otherwise fail at `a - b`.
lsm <- canberra(as.numeric(data[1, cats]), as.numeric(data[2, cats]))

# Then, if you want a single LSM score, you can average across categories:
lsm <- mean(lsm)

# You might alternatively have sources in the same row, differentiated by
# category name:
cats_1 <- cats
cats_2 <- paste0(cats, "_2")
cats <- c(cats_1, cats_2)
data <- matrix(
  rpois(length(cats) * 2, 20) / 10, 2, length(cats),
  dimnames = list(NULL, cats)
)

# In which case you could compare within the row, over multiple rows. Columns
# are selected by name rather than by position, and drop = FALSE keeps the
# selections as matrices so the next step also works for a single row:
lsm <- canberra(data[, cats_1, drop = FALSE], data[, cats_2, drop = FALSE])

# One score per row, averaging across categories:
lsm <- rowMeans(lsm)

## Finally, a more explicit version that may be more usable for people who use
## R for statistics but are not familiar with programming!
# Data should be set up in a "wide" format, so that every row represents a dyad.
# Partners in each dyad should have their LIWC results organized side by side.
# LIWC categories should be labeled to indicate which person the results belong
# to; for example, "_w" for wife (the code below reads columns ending in "_h"
# and "_w").
# NOTE(review): column names must match your own data exactly. This example
# reads `preps_h`/`preps_w`; some LIWC versions appear to name that category
# `prep` instead -- confirm against your columns.

# Calculate each category-level LSM score:
data$lsm_ppron_hw <- 1 -
  abs(data$ppron_h - data$ppron_w) / (data$ppron_h + data$ppron_w + .0001)
data$lsm_ipron_hw <- 1 -
  abs(data$ipron_h - data$ipron_w) / (data$ipron_h + data$ipron_w + .0001)
data$lsm_art_hw <- 1 -
  abs(data$article_h - data$article_w) /
    (data$article_h + data$article_w + .0001)
data$lsm_aux_hw <- 1 -
  abs(data$auxverb_h - data$auxverb_w) /
    (data$auxverb_h + data$auxverb_w + .0001)
data$lsm_adv_hw <- 1 -
  abs(data$adverb_h - data$adverb_w) / (data$adverb_h + data$adverb_w + .0001)
data$lsm_conj_hw <- 1 -
  abs(data$conj_h - data$conj_w) / (data$conj_h + data$conj_w + .0001)
data$lsm_preps_hw <- 1 -
  abs(data$preps_h - data$preps_w) / (data$preps_h + data$preps_w + .0001)
data$lsm_neg_hw <- 1 -
  abs(data$negate_h - data$negate_w) / (data$negate_h + data$negate_w + .0001)
data$lsm_quant_hw <- 1 -
  abs(data$quant_h - data$quant_w) / (data$quant_h + data$quant_w + .0001)

# Average those nine LSM scores (or fewer if you opt to omit some categories;
# in that case remove the term from the sum and lower the divisor to match):
data$lsm_hw <- (
  data$lsm_ppron_hw + data$lsm_ipron_hw + data$lsm_art_hw +
    data$lsm_aux_hw + data$lsm_adv_hw + data$lsm_conj_hw +
    data$lsm_preps_hw + data$lsm_neg_hw + data$lsm_quant_hw
) / 9