# Startup section: make sure devtools, splot, and lingmatch are available,
# attach splot and lingmatch, and print a short notice about what was loaded.

# Install devtools if it is not among the installed package names. Checking
# rownames() compares against package names only, not every cell of the
# installed.packages() matrix. Errors are downgraded to warnings so that
# sourcing this file continues.
if(!'devtools' %in% rownames(installed.packages())){
  tryCatch(
    install.packages('devtools'),
    error = function(e) warning(e$message, call. = FALSE)
  )
}

# Install the splot package from the 'miserman/splot' GitHub repository.
#   ...: forwarded to devtools::install_github().
# Installation errors are reported as warnings rather than stopping.
splot.update=function(...){
  tryCatch(
    devtools::install_github('miserman/splot', ...),
    error = function(e) warning(e$message, call. = FALSE)
  )
}

# Install the lingmatch package from the 'miserman/lingmatch' GitHub repository.
#   ...: forwarded to devtools::install_github().
# Installation errors are reported as warnings rather than stopping.
lingmatch.update=function(...){
  tryCatch(
    devtools::install_github('miserman/lingmatch', ...),
    error = function(e) warning(e$message, call. = FALSE)
  )
}

if(!'splot' %in% rownames(installed.packages())) splot.update()
if(!'lingmatch' %in% rownames(installed.packages())) lingmatch.update()

# require(quietly = TRUE) returns FALSE without any signal when a package is
# unavailable, so keep its result and only announce what was actually attached.
# local() keeps the bookkeeping variables out of the caller's workspace.
local({
  pkgs = c('splot', 'lingmatch')
  attached = vapply(pkgs, require, logical(1), quietly = TRUE, character.only = TRUE)
  if(all(attached)){
    message(paste0(
      'LUSI lab library loaded: partial documentation at www.depts.ttu.edu/psy/lusi/resources.php',
      ' development versions of splot and lingmatch were also loaded; use splot.update() or',
      ' lingmatch.update() to get the latest versions, and see ?splot or ?lingmatch for help.'
    ))
  }else{
    message('LUSI lab library loaded: partial documentation at www.depts.ttu.edu/psy/lusi/resources.php')
    warning(
      'could not load ', paste(pkgs[!attached], collapse = ' and '),
      '; use splot.update() or lingmatch.update() to install', call. = FALSE
    )
  }
})

single_reviews=function(title,pages,write=TRUE,path=getwd(),filename){ url=paste0('https://www.rottentomatoes.com/m/',title,'/reviews/?page=1&type=user') tt=tryCatch(readLines(url,warn=FALSE),error=function(e)NULL) if(is.null(tt)) return(tt) op=data.frame() np=max(1,as.integer(gsub('[^0-9]','', gsub('^.*Page \\d+ of | np)) pages=np cat('\n\npulling reviews for',title,'\n') for(p in seq_len(pages)){ if(p!=1){ url=paste0('https://www.rottentomatoes.com/m/',title,'/reviews/?page=',p,'&type=user') tt=tryCatch(readLines(url,warn=FALSE),error=function(e)NULL) } if(is.null(tt)) next cat('\rpage',p,'of',pages) revs=strsplit(tt[grep('review_table',tt,fixed=TRUE)],'
',fixed=TRUE)[[1]][-1] user=as.integer(sub('[^0-9]','',sub('\\/.*','',sub('^.*\\/user\\/id\\/','',revs)))) super_reviewer=grepl('Super Reviewer',revs,fixed=TRUE) rating=unlist(lapply(strsplit(revs,'span',fixed=TRUE),function(r)length(grep('glyphicon-star',r,fixed=TRUE)))) if(length(half<-grep('½',revs,fixed=TRUE))!=0) rating[half]=rating[half]+.5 date=sub('<.*','',sub('^.*subtle\\">','',revs)) text=sub('
.*','',sub('^.* ','',revs)) text=gsub(''',"'",text,fixed=TRUE) text=gsub('"','"',text,fixed=TRUE) text=gsub('<[^>]*>',' ',text) op=rbind(op,cbind(user,super_reviewer,user_profile=paste0('https://www.rottentomatoes.com/user/id/',user), rating,text,date,url=rep(url,length(user)))) } if(nrow(op)!=0 && write || !missing(path) || !missing(filename)){ if(missing(filename)) filename=paste0(title,'_reviews') path=paste0(sub('/+$','',path),'/',filename,'.csv') tryCatch(write.csv(op,path,row.names=FALSE), error=function(e)warning('failed to save file: ',e$message,call.=FALSE)) cat('\nfile saved to',path) } invisible(op) }

# Pull user reviews for one or more movies and stack them into one data.frame.
#
# Calls single_reviews() once per entry of `titles` (with writing disabled),
# adds a `title` column to each result, combines the results, and optionally
# saves the combined table as a CSV file.
#
# Arguments:
#   titles:   vector of titles; each is passed to single_reviews(), which
#             pastes it into the review-page URL.
#   pages:    passed through unchanged as the `pages` argument of
#             single_reviews().
#   write:    if TRUE, save the combined reviews as a CSV file.
#   path:     directory to save in; supplying it requests a save even when
#             `write` is FALSE.
#   filename: file name without the '.csv' extension; supplying it also
#             requests a save. When missing, it is '<title>_reviews_<date>'
#             for a single title and 'movie_reviews_<date>' otherwise.
#
# Returns, invisibly, the combined data.frame (an empty data.frame when no
# reviews were pulled). A title that fails is reported with a warning and
# skipped. Nothing is written when there are no reviews to save.
movie_reviews=function(titles,pages=NULL,write=TRUE,path=getwd(),filename){
  # collect per-title results in a list and bind once, instead of growing a
  # data.frame with rbind() on every iteration
  pulled = vector('list', length(titles))
  for(i in seq_along(titles)){
    title = titles[[i]]
    revs = tryCatch(single_reviews(title, pages, FALSE), error = function(e){
      warning('failed to pull reviews for ', title, ': ', e$message, call. = FALSE)
      NULL
    })
    # a zero-row result would gain a lone `title` column and no longer line up
    # with the other tables when bound, so skip it along with failed pulls
    if(is.null(revs) || nrow(revs) == 0) next
    revs$title = rep(title, nrow(revs))
    pulled[[i]] = revs
  }
  pulled = Filter(Negate(is.null), pulled)
  op = if(length(pulled) > 0) do.call(rbind, pulled) else data.frame()
  # parenthesized so that an empty result is never written, even when `path`
  # or `filename` was supplied (`&&` binds tighter than `||`)
  if(nrow(op) != 0 && (write || !missing(path) || !missing(filename))){
    if(missing(filename)){
      filename = paste0(if(length(titles) == 1) titles else 'movie', '_reviews_', Sys.Date())
    }
    path = paste0(sub('/+$', '', path), '/', filename, '.csv')
    saved = tryCatch({
      write.csv(op, path, row.names = FALSE)
      TRUE
    }, error = function(e){
      warning('failed to save file: ', e$message, call. = FALSE)
      FALSE
    })
    # only announce the file when it was actually written
    if(saved) cat('\n\nfile saved to', path)
  }
  invisible(op)
}

movie_ratings_links = function(titles, add = '', retry = TRUE, search_source = 'https://search.yahoo.com/search?q='){ r = q = list() rt = NULL al = length(add) tl = length(titles) if(al < tl) add[seq(al + 1, tl)] = '' else if(al != tl) add = add[seq_len(tl)] names(add) = titles gs = function(title, add, sites = c('imdb', 'rottentomatoes', 'metacritic')){ tryCatch({ s = readLines(paste0(search_source, if(length(sites) == 1) paste0(if(!grepl('\\W', title)) 'movie', '+', sites) else 'movie+reviews', if(add == '') paste0('+"', title, '"') else paste0('+', add, '+"', title, '"')), warn = FALSE) s = grep('https://www.', paste(s, collapse = ' '), fixed = TRUE, value = TRUE) s = 
unique(regmatches(s, gregexpr(paste0('https://www\\.(', paste(sites, collapse = '|'), ')\\.com/(title|m|movie)/[^ &?"%Budget:', p, fixed = TRUE, value = TRUE) opening = grep('

Opening Weekend USA:

', p, fixed = TRUE, value = TRUE) gross = grep('

Gross USA:

', p, fixed = TRUE, value = TRUE) title = grep('

', p, fixed = TRUE, value = TRUE) p = jsonlite::fromJSON(paste0('{', paste( p[seq(grep('', p)[1] - 1)], collapse = ' '), '}')) duration = if(is.null(p$duration)) NA else strsplit(p$duration, '', TRUE)[[1]] dl = length(duration) data.frame( url = urls[[i]], title = if(length(title)) sub('^<.*?>', '', sub('&n.*$', '', title)) else NA, original_title = if(is.null(p$name)) NA else p$name, author = if(is.null(p$creator$name) || (is.null(p$creator) && 'Person' %in% p$creator$`@type`)) NA else paste(unique(p$creator[p$creator$`@type` == 'Person', 'name']), collapse = ', '), director = if(is.null(p$director)) NA else paste(unique(p$director$name), collapse = ', '), genre = paste(p$genre, collapse = ', '), content_rating = if(is.null(p$contentRating)) NA else p$contentRating, date = if(is.null(p$datePublished)) NA else p$datePublished, hours = if(dl == 1) NA else if(duration[dl] == 'M'){ as.numeric(gsub('[^0-9]', '', paste(duration, collapse = ''))) / 60 }else as.numeric(paste0(duration[3])) + if(length(duration) > 4) as.numeric(gsub('[^0-9]', '', paste(duration[5:6], collapse = ''))) / 60 else 0, rating = if(is.null(p$aggregateRating$ratingValue)) NA else as.numeric(p$aggregateRating$ratingValue), nusers = if(is.null(p$aggregateRating$ratingValue)) NA else as.numeric(p$aggregateRating$ratingCount), budget = if(length(bo)) gsub('^[^$]+|[$,]', '', bo) else NA, opening = if(length(opening)) gsub('^[^$]+|[$,]', '', opening) else NA, usgross = if(length(gross)) gsub('^[^$]+|, <.*$|[$,]', '', gross) else NA ) }, error = function(e) rep(NA, ncol(r))) r } rottentomatoes_meta = function(urls){ r = data.frame( url = character(), title = character(), author = character(), director = character(), critic_average = numeric(), critic_rating = numeric(), critic_n = numeric(), critic_fresh = numeric(), critic_rotten = numeric(), user_average = numeric(), user_n = numeric(), user_percliked = numeric(), content_rating = character(), genres = character(), date = character(), boxoffice = 
character(), runtime = character() ) for(i in seq_along(urls)) r[i, ] = tryCatch({ p = readLines(urls[[i]], warn = FALSE) st = grep('', '', p[c(st, st + 1)])) si = jsonlite::fromJSON(gsub('.*= |;$', '', grep('.scoreInfo =', p, fixed = TRUE, value = TRUE))) meta = grep('