Text Analytics Using R - Part C: Extraction of Samsung S4 white reviews from Amazon.com and Amazon India

Hi Folks!!

Welcome to the 3rd part of the series, where we will extract the reviewer's name, date, rating and review text for the Galaxy S4.

Given below is the code to extract the reviews from Amazon.com and Amazon.in and combine them into a new data frame.

Code:
# Amazon.in reviews ----
# Downloads the first `num` review pages for the product, pulls the review
# text, rating, reviewer and review date out of each page, and assembles them
# into the `Amazon_IN` data frame.
library(RCurl)
library(XML)
library(rvest)

# First review page; further pages are discovered by following pagination
# links whose href contains `crawlCandicate`.
init <- "http://www.amazon.in/Samsung-Galaxy-S4-GT-I9500-White/product-reviews/B00CL4HXQC"
crawlCandicate <- "ref=cm_cr_pr_btm_link_"
base <- "http://www.amazon.in"
num <- 3 # number of review pages to download

doclist <- list() # raw HTML of each downloaded page
anchorlist <- character() # unique pagination hrefs seen so far
j <- 0
while (j < num) {
  if (j == 0) {
    doclist[[j + 1]] <- getURL(init)
  } else {
    next_link <- anchorlist[j + 1]
    # Fail with a clear message instead of requesting "<base>NA".
    if (is.na(next_link)) {
      stop("No pagination link found for review page ", j + 1, call. = FALSE)
    }
    doclist[[j + 1]] <- getURL(paste0(base, next_link))
  }
  doc <- htmlParse(doclist[[j + 1]], asText = TRUE)
  # Collect the href of every <a> tag; anchors without an href become NA so
  # the result is always a plain character vector.
  anchor <- getNodeSet(doc, "//a")
  anchor <- vapply(
    anchor,
    function(x) xmlGetAttr(x, "href", default = NA_character_),
    character(1)
  )
  # Keep only the pagination links.
  anchor <- anchor[grep(crawlCandicate, anchor)]
  anchorlist <- unique(c(anchorlist, anchor))
  j <- j + 1
}

# Text content of every node matching `xpath`, always as a character vector
# (an empty match gives character(0)).
node_text <- function(doc, xpath) {
  vapply(
    getNodeSet(doc, xpath),
    function(node) paste(xmlValue(node), collapse = ""),
    character(1)
  )
}

reviews <- c()
ratings <- c()
reviewers <- c()
date_of_review <- c()
# Walk over every downloaded page rather than a hard-coded 1:3.
for (i in seq_along(doclist)) {
  doc <- htmlParse(doclist[[i]], asText = TRUE)
  l1 <- node_text(doc, "//div/span[@class='a-size-base review-text']")
  rates <- node_text(doc, "//div/a/i/span[@class='a-icon-alt']")
  # Class name aligned with the Amazon.com section ('review-byline').
  reviewer <- node_text(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-byline']"
  )
  dates <- node_text(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-date']"
  )
  ratings <- c(ratings, rates)
  reviews <- c(reviews, l1)
  reviewers <- c(reviewers, reviewer)
  date_of_review <- c(date_of_review, dates)
}

reviews.data <- data.frame(reviews, site = "Amazon.in")
ratings.data <- data.frame(ratings, site = "Amazon.in")
reviewers.data <- data.frame(reviewers, site = "Amazon.in")
date_of_review.data <- data.frame(date_of_review, site = "Amazon.in")
# `reviews.data` keeps both of its columns (review text and site); for the
# other three only the value column is taken (`-2` drops `site`).
# NOTE(review): the fixed 1:30 presumably assumes 3 pages x 10 reviews;
# rows beyond the scraped count come out as NA - confirm this is intended.
Amazon_IN <- data.frame(
  reviews = reviews.data[1:30, ],
  ratings = ratings.data[1:30, -2],
  reviewers = reviewers.data[1:30, -2],
  date_of_review = date_of_review.data[1:30, -2]
)
# Amazon.com reviews ----
# Same procedure as for Amazon.in: download the first `num` review pages,
# extract review text, rating, reviewer and date, build `Amazon_COM`, and
# finally stack it under `Amazon_IN` to get `Amazon_ALL`.
library(RCurl)
library(XML)
library(rvest)

# Start page on amazon.com, matching `base` and the "Amazon.com" site label.
# NOTE(review): the path and product id are copied from the amazon.in URL;
# confirm they resolve to the same product on amazon.com.
init <- "http://www.amazon.com/Samsung-Galaxy-S4-GT-I9500-White/product-reviews/B00CL4HXQC"
crawlCandicate <- "pageNumber="
base <- "http://www.amazon.com"
num <- 3 # number of review pages to download

doclist <- list() # raw HTML of each downloaded page
anchorlist <- character() # unique pagination hrefs seen so far
j <- 0
while (j < num) {
  if (j == 0) {
    doclist[[j + 1]] <- getURL(init)
  } else {
    next_link <- anchorlist[j + 1]
    # Fail with a clear message instead of requesting "<base>NA".
    if (is.na(next_link)) {
      stop("No pagination link found for review page ", j + 1, call. = FALSE)
    }
    doclist[[j + 1]] <- getURL(paste0(base, next_link))
  }
  doc <- htmlParse(doclist[[j + 1]], asText = TRUE)
  # Collect the href of every <a> tag; anchors without an href become NA so
  # the result is always a plain character vector.
  anchor <- getNodeSet(doc, "//a")
  anchor <- vapply(
    anchor,
    function(x) xmlGetAttr(x, "href", default = NA_character_),
    character(1)
  )
  # Keep only the pagination links.
  anchor <- anchor[grep(crawlCandicate, anchor)]
  anchorlist <- unique(c(anchorlist, anchor))
  j <- j + 1
}

# Text content of every node matching `xpath`, always as a character vector
# (an empty match gives character(0)).
node_text <- function(doc, xpath) {
  vapply(
    getNodeSet(doc, xpath),
    function(node) paste(xmlValue(node), collapse = ""),
    character(1)
  )
}

reviews <- c()
ratings <- c()
reviewers <- c()
date_of_review <- c()
# Walk over every downloaded page rather than a hard-coded 1:3.
for (i in seq_along(doclist)) {
  doc <- htmlParse(doclist[[i]], asText = TRUE)
  l1 <- node_text(doc, "//div/span[@class='a-size-base review-text']")
  rates <- node_text(doc, "//div/a/i/span[@class='a-icon-alt']")
  reviewer <- node_text(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-byline']"
  )
  dates <- node_text(
    doc,
    "//div/span[@class='a-size-base a-color-secondary review-date']"
  )
  ratings <- c(ratings, rates)
  reviews <- c(reviews, l1)
  reviewers <- c(reviewers, reviewer)
  date_of_review <- c(date_of_review, dates)
}

reviews.data <- data.frame(reviews, site = "Amazon.com")
ratings.data <- data.frame(ratings, site = "Amazon.com")
reviewers.data <- data.frame(reviewers, site = "Amazon.com")
date_of_review.data <- data.frame(date_of_review, site = "Amazon.com")
# `reviews.data` keeps both of its columns (review text and site); for the
# other three only the value column is taken (`-2` drops `site`).
# NOTE(review): the fixed 1:30 presumably assumes 3 pages x 10 reviews;
# rows beyond the scraped count come out as NA - confirm this is intended.
Amazon_COM <- data.frame(
  reviews = reviews.data[1:30, ],
  ratings = ratings.data[1:30, -2],
  reviewers = reviewers.data[1:30, -2],
  date_of_review = date_of_review.data[1:30, -2]
)

# Combine both sites; requires `Amazon_IN` from the Amazon.in section.
Amazon_ALL <- rbind(Amazon_IN, Amazon_COM)

Explanation: The majority of the code has been explained in Part A and Part B of the blog. Here we have attached the code for amazon.in and amazon.com, from which we can get the respective reviews.











Comments

Popular posts from this blog

Kabaddi Match: Lets meet at the arena!! Aa jao Dam Dikhane!!!

Text Analytics Using R - Part A: Extraction of reviews of galaxy s4 product reviews in flipkart

Replace your Social Media with the New Age Social Media: Fitness with Motivation