Text Analytics Using R - Part C: Extraction of Samsung Galaxy S4 (White) Reviews from Amazon.com and Amazon India
Hi Folks!!
Welcome to the third part of the series, where we extract the reviewer's name, the date, the rating and the review text for the Samsung Galaxy S4.
Given below is the code that extracts the reviews from Amazon.com and Amazon.in and combines them into a new data frame.
Code:
# Crawl the first `num` review pages of the Galaxy S4 listing on Amazon.in.
# Results used by the extraction step that follows:
#   doclist    - list of raw HTML strings, one per fetched page
#   anchorlist - unique pagination hrefs collected so far
library(RCurl)
library(XML)
library(rvest)

# First review page. The path segment is "product-reviews" (hyphenated):
# a space in the URL makes the request fail.
init <- "http://www.amazon.in/Samsung-Galaxy-S4-GT-I9500-White/product-reviews/B00CL4HXQC"
# Substring that identifies the pagination links among all anchors on a page.
crawlCandicate <- "ref=cm_cr_pr_btm_link_"
# Host that is prepended to each pagination href before it is fetched.
base <- "http://www.amazon.in"
num <- 3
doclist <- list()
anchorlist <- character()
j <- 0
while (j < num) {
  if (j == 0) {
    doclist[[j + 1]] <- getURL(init)
  } else {
    # Fail with a clear message instead of requesting "<base>NA" when the
    # previous pages did not expose enough pagination links.
    if (length(anchorlist) < j + 1) {
      stop(
        "Only ", length(anchorlist),
        " pagination link(s) found; cannot fetch page ", j + 1,
        call. = FALSE
      )
    }
    # Follow the (j + 1)-th pagination link collected so far.
    doclist[[j + 1]] <- getURL(paste0(base, anchorlist[j + 1]))
  }
  doc <- htmlParse(doclist[[j + 1]])
  # Collect every <a> tag and read its href; anchors without an href yield NA
  # so the result is always a plain character vector.
  anchor <- getNodeSet(doc, "//a")
  anchor <- vapply(
    anchor,
    function(x) xmlGetAttr(x, "href", default = NA_character_),
    character(1)
  )
  # Keep only the pagination links and merge them into the running list.
  anchor <- anchor[grep(crawlCandicate, anchor)]
  anchorlist <- unique(c(anchorlist, anchor))
  j <- j + 1
}
# Extract review text, star rating, reviewer byline and review date from every
# fetched Amazon.in page held in `doclist`, then assemble them into `Amazon_IN`.
reviews <- c()
ratings <- c()
reviewers <- c()
date_of_review <- c()
# Iterate over the pages that were actually fetched rather than a fixed 1:3.
for (i in seq_along(doclist)) {
  doc <- htmlParse(doclist[[i]])
  l <- getNodeSet(doc, "//div/span[@class='a-size-base review-text']")
  l1 <- sapply(l, xmlValue)
  rateNodes <- getNodeSet(doc, "//div/a/i/span[@class='a-icon-alt']")
  rates <- sapply(rateNodes, xmlValue)
  # The class token is "review-byline" (hyphenated), matching the selector
  # used for Amazon.com; "reviewbyline" selects nothing.
  reviewerNodes <- getNodeSet(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-byline']"
  )
  reviewer <- sapply(reviewerNodes, xmlValue)
  dateNodes <- getNodeSet(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-date']"
  )
  dates <- sapply(dateNodes, xmlValue)
  # Append this page's values to the running vectors.
  ratings <- c(ratings, rates)
  reviews <- c(reviews, l1)
  reviewers <- c(reviewers, reviewer)
  date_of_review <- c(date_of_review, dates)
}
# Tag every field with the site it came from.
reviews.data <- data.frame(reviews, site = "Amazon.in")
ratings.data <- data.frame(ratings, site = "Amazon.in")
reviewers.data <- data.frame(reviewers, site = "Amazon.in")
date_of_review.data <- data.frame(date_of_review, site = "Amazon.in")
# Combine the first 30 rows of each field into one data frame.
# `reviews.data` keeps both of its columns, so the result carries
# `reviews.reviews` and `reviews.site`; `-2` drops the duplicate site column
# from the other three.
# NOTE(review): 30 presumably means 3 pages x 10 reviews per page - confirm;
# rows past the end of a field come back as NA.
Amazon_IN <- data.frame(
  reviews = reviews.data[1:30, ],
  ratings = ratings.data[1:30, -2],
  reviewers = reviewers.data[1:30, -2],
  date_of_review = date_of_review.data[1:30, -2]
)
#Amazon.com reviews
# Crawl the first `num` review pages of the Galaxy S4 listing on Amazon.com.
# Results used by the extraction step that follows:
#   doclist    - list of raw HTML strings, one per fetched page
#   anchorlist - unique pagination hrefs collected so far
library(RCurl)
library(XML)
library(rvest)

# First review page. It must live on the same host as `base`; pointing it at
# amazon.in would label Amazon.in reviews as Amazon.com.
# NOTE(review): confirm that this product id (B00CL4HXQC) is the one used for
# the listing on amazon.com.
init <- "http://www.amazon.com/Samsung-Galaxy-S4-GT-I9500-White/product-reviews/B00CL4HXQC"
# Substring that identifies the pagination links among all anchors on a page.
crawlCandicate <- "pageNumber="
# Host that is prepended to each pagination href before it is fetched.
base <- "http://www.amazon.com"
num <- 3
doclist <- list()
anchorlist <- character()
j <- 0
while (j < num) {
  if (j == 0) {
    doclist[[j + 1]] <- getURL(init)
  } else {
    # Fail with a clear message instead of requesting "<base>NA" when the
    # previous pages did not expose enough pagination links.
    if (length(anchorlist) < j + 1) {
      stop(
        "Only ", length(anchorlist),
        " pagination link(s) found; cannot fetch page ", j + 1,
        call. = FALSE
      )
    }
    # Follow the (j + 1)-th pagination link collected so far.
    doclist[[j + 1]] <- getURL(paste0(base, anchorlist[j + 1]))
  }
  doc <- htmlParse(doclist[[j + 1]])
  # Collect every <a> tag and read its href; anchors without an href yield NA
  # so the result is always a plain character vector.
  anchor <- getNodeSet(doc, "//a")
  anchor <- vapply(
    anchor,
    function(x) xmlGetAttr(x, "href", default = NA_character_),
    character(1)
  )
  # Keep only the pagination links and merge them into the running list.
  anchor <- anchor[grep(crawlCandicate, anchor)]
  anchorlist <- unique(c(anchorlist, anchor))
  j <- j + 1
}
# Extract review text, star rating, reviewer byline and review date from every
# fetched Amazon.com page held in `doclist`, assemble them into `Amazon_COM`,
# and stack that under `Amazon_IN` to produce `Amazon_ALL`.
reviews <- c()
ratings <- c()
reviewers <- c()
date_of_review <- c()
# Iterate over the pages that were actually fetched rather than a fixed 1:3.
for (i in seq_along(doclist)) {
  doc <- htmlParse(doclist[[i]])
  l <- getNodeSet(doc, "//div/span[@class='a-size-base review-text']")
  l1 <- sapply(l, xmlValue)
  rateNodes <- getNodeSet(doc, "//div/a/i/span[@class='a-icon-alt']")
  rates <- sapply(rateNodes, xmlValue)
  reviewerNodes <- getNodeSet(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-byline']"
  )
  reviewer <- sapply(reviewerNodes, xmlValue)
  dateNodes <- getNodeSet(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-date']"
  )
  dates <- sapply(dateNodes, xmlValue)
  # Append this page's values to the running vectors.
  ratings <- c(ratings, rates)
  reviews <- c(reviews, l1)
  reviewers <- c(reviewers, reviewer)
  date_of_review <- c(date_of_review, dates)
}
# Tag every field with the site it came from. This statement must start on
# its own line after the loop's closing brace.
reviews.data <- data.frame(reviews, site = "Amazon.com")
ratings.data <- data.frame(ratings, site = "Amazon.com")
reviewers.data <- data.frame(reviewers, site = "Amazon.com")
date_of_review.data <- data.frame(date_of_review, site = "Amazon.com")
# Combine the first 30 rows of each field into one data frame.
# `reviews.data` keeps both of its columns, so the result carries
# `reviews.reviews` and `reviews.site`; `-2` drops the duplicate site column
# from the other three.
# NOTE(review): 30 presumably means 3 pages x 10 reviews per page - confirm;
# rows past the end of a field come back as NA.
Amazon_COM <- data.frame(
  reviews = reviews.data[1:30, ],
  ratings = ratings.data[1:30, -2],
  reviewers = reviewers.data[1:30, -2],
  date_of_review = date_of_review.data[1:30, -2]
)
# Stack the Amazon.in rows (built earlier in this script) on top of the
# Amazon.com rows; both frames have the same columns.
Amazon_ALL <- rbind(Amazon_IN, Amazon_COM)
Explanation: The majority of this code was explained in Part A and Part B of this series. Here we have included the code for both Amazon.in and Amazon.com, which retrieves the reviews from each site.
Welcome to the third part of the series, where we extract the reviewer's name, the date, the rating and the review text for the Samsung Galaxy S4.
Given below is the code that extracts the reviews from Amazon.com and Amazon.in and combines them into a new data frame.
Code:
# Crawl the first `num` review pages of the Galaxy S4 listing on Amazon.in.
# Results used by the extraction step that follows:
#   doclist    - list of raw HTML strings, one per fetched page
#   anchorlist - unique pagination hrefs collected so far
library(RCurl)
library(XML)
library(rvest)

# First review page. The path segment is "product-reviews" (hyphenated):
# a space in the URL makes the request fail.
init <- "http://www.amazon.in/Samsung-Galaxy-S4-GT-I9500-White/product-reviews/B00CL4HXQC"
# Substring that identifies the pagination links among all anchors on a page.
crawlCandicate <- "ref=cm_cr_pr_btm_link_"
# Host that is prepended to each pagination href before it is fetched.
base <- "http://www.amazon.in"
num <- 3
doclist <- list()
anchorlist <- character()
j <- 0
while (j < num) {
  if (j == 0) {
    doclist[[j + 1]] <- getURL(init)
  } else {
    # Fail with a clear message instead of requesting "<base>NA" when the
    # previous pages did not expose enough pagination links.
    if (length(anchorlist) < j + 1) {
      stop(
        "Only ", length(anchorlist),
        " pagination link(s) found; cannot fetch page ", j + 1,
        call. = FALSE
      )
    }
    # Follow the (j + 1)-th pagination link collected so far.
    doclist[[j + 1]] <- getURL(paste0(base, anchorlist[j + 1]))
  }
  doc <- htmlParse(doclist[[j + 1]])
  # Collect every <a> tag and read its href; anchors without an href yield NA
  # so the result is always a plain character vector.
  anchor <- getNodeSet(doc, "//a")
  anchor <- vapply(
    anchor,
    function(x) xmlGetAttr(x, "href", default = NA_character_),
    character(1)
  )
  # Keep only the pagination links and merge them into the running list.
  anchor <- anchor[grep(crawlCandicate, anchor)]
  anchorlist <- unique(c(anchorlist, anchor))
  j <- j + 1
}
# Extract review text, star rating, reviewer byline and review date from every
# fetched Amazon.in page held in `doclist`, then assemble them into `Amazon_IN`.
reviews <- c()
ratings <- c()
reviewers <- c()
date_of_review <- c()
# Iterate over the pages that were actually fetched rather than a fixed 1:3.
for (i in seq_along(doclist)) {
  doc <- htmlParse(doclist[[i]])
  l <- getNodeSet(doc, "//div/span[@class='a-size-base review-text']")
  l1 <- sapply(l, xmlValue)
  rateNodes <- getNodeSet(doc, "//div/a/i/span[@class='a-icon-alt']")
  rates <- sapply(rateNodes, xmlValue)
  # The class token is "review-byline" (hyphenated), matching the selector
  # used for Amazon.com; "reviewbyline" selects nothing.
  reviewerNodes <- getNodeSet(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-byline']"
  )
  reviewer <- sapply(reviewerNodes, xmlValue)
  dateNodes <- getNodeSet(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-date']"
  )
  dates <- sapply(dateNodes, xmlValue)
  # Append this page's values to the running vectors.
  ratings <- c(ratings, rates)
  reviews <- c(reviews, l1)
  reviewers <- c(reviewers, reviewer)
  date_of_review <- c(date_of_review, dates)
}
# Tag every field with the site it came from.
reviews.data <- data.frame(reviews, site = "Amazon.in")
ratings.data <- data.frame(ratings, site = "Amazon.in")
reviewers.data <- data.frame(reviewers, site = "Amazon.in")
date_of_review.data <- data.frame(date_of_review, site = "Amazon.in")
# Combine the first 30 rows of each field into one data frame.
# `reviews.data` keeps both of its columns, so the result carries
# `reviews.reviews` and `reviews.site`; `-2` drops the duplicate site column
# from the other three.
# NOTE(review): 30 presumably means 3 pages x 10 reviews per page - confirm;
# rows past the end of a field come back as NA.
Amazon_IN <- data.frame(
  reviews = reviews.data[1:30, ],
  ratings = ratings.data[1:30, -2],
  reviewers = reviewers.data[1:30, -2],
  date_of_review = date_of_review.data[1:30, -2]
)
#Amazon.com reviews
# Crawl the first `num` review pages of the Galaxy S4 listing on Amazon.com.
# Results used by the extraction step that follows:
#   doclist    - list of raw HTML strings, one per fetched page
#   anchorlist - unique pagination hrefs collected so far
library(RCurl)
library(XML)
library(rvest)

# First review page. It must live on the same host as `base`; pointing it at
# amazon.in would label Amazon.in reviews as Amazon.com.
# NOTE(review): confirm that this product id (B00CL4HXQC) is the one used for
# the listing on amazon.com.
init <- "http://www.amazon.com/Samsung-Galaxy-S4-GT-I9500-White/product-reviews/B00CL4HXQC"
# Substring that identifies the pagination links among all anchors on a page.
crawlCandicate <- "pageNumber="
# Host that is prepended to each pagination href before it is fetched.
base <- "http://www.amazon.com"
num <- 3
doclist <- list()
anchorlist <- character()
j <- 0
while (j < num) {
  if (j == 0) {
    doclist[[j + 1]] <- getURL(init)
  } else {
    # Fail with a clear message instead of requesting "<base>NA" when the
    # previous pages did not expose enough pagination links.
    if (length(anchorlist) < j + 1) {
      stop(
        "Only ", length(anchorlist),
        " pagination link(s) found; cannot fetch page ", j + 1,
        call. = FALSE
      )
    }
    # Follow the (j + 1)-th pagination link collected so far.
    doclist[[j + 1]] <- getURL(paste0(base, anchorlist[j + 1]))
  }
  doc <- htmlParse(doclist[[j + 1]])
  # Collect every <a> tag and read its href; anchors without an href yield NA
  # so the result is always a plain character vector.
  anchor <- getNodeSet(doc, "//a")
  anchor <- vapply(
    anchor,
    function(x) xmlGetAttr(x, "href", default = NA_character_),
    character(1)
  )
  # Keep only the pagination links and merge them into the running list.
  anchor <- anchor[grep(crawlCandicate, anchor)]
  anchorlist <- unique(c(anchorlist, anchor))
  j <- j + 1
}
# Extract review text, star rating, reviewer byline and review date from every
# fetched Amazon.com page held in `doclist`, assemble them into `Amazon_COM`,
# and stack that under `Amazon_IN` to produce `Amazon_ALL`.
reviews <- c()
ratings <- c()
reviewers <- c()
date_of_review <- c()
# Iterate over the pages that were actually fetched rather than a fixed 1:3.
for (i in seq_along(doclist)) {
  doc <- htmlParse(doclist[[i]])
  l <- getNodeSet(doc, "//div/span[@class='a-size-base review-text']")
  l1 <- sapply(l, xmlValue)
  rateNodes <- getNodeSet(doc, "//div/a/i/span[@class='a-icon-alt']")
  rates <- sapply(rateNodes, xmlValue)
  reviewerNodes <- getNodeSet(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-byline']"
  )
  reviewer <- sapply(reviewerNodes, xmlValue)
  dateNodes <- getNodeSet(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-date']"
  )
  dates <- sapply(dateNodes, xmlValue)
  # Append this page's values to the running vectors.
  ratings <- c(ratings, rates)
  reviews <- c(reviews, l1)
  reviewers <- c(reviewers, reviewer)
  date_of_review <- c(date_of_review, dates)
}
# Tag every field with the site it came from. This statement must start on
# its own line after the loop's closing brace.
reviews.data <- data.frame(reviews, site = "Amazon.com")
ratings.data <- data.frame(ratings, site = "Amazon.com")
reviewers.data <- data.frame(reviewers, site = "Amazon.com")
date_of_review.data <- data.frame(date_of_review, site = "Amazon.com")
# Combine the first 30 rows of each field into one data frame.
# `reviews.data` keeps both of its columns, so the result carries
# `reviews.reviews` and `reviews.site`; `-2` drops the duplicate site column
# from the other three.
# NOTE(review): 30 presumably means 3 pages x 10 reviews per page - confirm;
# rows past the end of a field come back as NA.
Amazon_COM <- data.frame(
  reviews = reviews.data[1:30, ],
  ratings = ratings.data[1:30, -2],
  reviewers = reviewers.data[1:30, -2],
  date_of_review = date_of_review.data[1:30, -2]
)
# Stack the Amazon.in rows (built earlier in this script) on top of the
# Amazon.com rows; both frames have the same columns.
Amazon_ALL <- rbind(Amazon_IN, Amazon_COM)
Explanation: The majority of this code was explained in Part A and Part B of this series. Here we have included the code for both Amazon.in and Amazon.com, which retrieves the reviews from each site.
Comments
Post a Comment