How to Create an XML Sitemap with R

XML sitemaps are a must-have when you want to make sure Google crawls your pages. There are plenty of free and paid tools to generate them, but this article will show you how to create XML sitemaps for free with a simple R script. The method costs nothing; it only requires some knowledge of R scripting for SEO. If you are more comfortable with Python, check out our previous article on how to create an XML sitemap with Python. We can also automate the whole process with a crontab entry and run it every month to keep the sitemaps up to date; I will demonstrate that in a future article 😉

Enough talk, let's start coding. For this example, we will use the Moz Blog to test our script.

What are we going to do here? 🧐

  1. Crawl your website
  2. Keep only the SEO-friendly URLs
  3. Classify URLs by page type
  4. Create the sitemaps

Follow the steps 🙈

Step #1: Load the libraries you need

# Define your working directory
setwd("~/Desktop/xxxx/xxxxx")

# Install libraries that are not installed yet (whisker and httr are added here because Step #5 needs them)
packages <- c("Rcrawler","dplyr", "stringr","xml2","tidyverse","tidyr","whisker","httr")

if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(packages, rownames(installed.packages())))  
}
# Load packages
library(Rcrawler)
library(dplyr)
library(stringr)
library(xml2)
library(tidyverse)
library(tidyr)

Step #2: Start a crawl

# Define what to scrape during the crawl: canonical tags and meta robots
CustomLabels <- c("canonical_tag", "meta_robots")
CustomXPaths <- c("//link[@rel='canonical']/@href", "//meta[@name='robots']/@content")

# Start the crawl right here
Rcrawler(Website = "https://moz.com/blog", 
         ExtractXpathPat = CustomXPaths, 
         PatternsNames = CustomLabels)
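Once the crawl finishes, Rcrawler stores its results directly in your global environment: INDEX, a data frame with one row per crawled URL, and DATA, a list holding the values extracted with our XPath patterns. A quick look at both (purely a sanity check) before moving on:

# Inspect the objects created by Rcrawler
str(INDEX)     # one row per crawled URL (Id, Url, HTTP response, etc.)
length(DATA)   # one element per page, holding canonical_tag and meta_robots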

Step #3: Clean the data and keep only SEO-friendly URLs

# Merge the crawl index with the scraped data
crawl <- data.frame(do.call("rbind", DATA))
crawl_complete <- cbind(INDEX, crawl)
Idurl <- as.numeric(crawl_complete$Id)
crawl_complete <- cbind(Idurl, crawl_complete)
       
# Now, let's keep only the SEO-friendly URLs
crawl_complete <- mutate(crawl_complete, Canonical_indexability = ifelse(Url == canonical_tag | is.na(crawl_complete$canonical_tag), "Canonical Matching", "Canonical not Matching")) 
crawl_complete <- mutate(crawl_complete, indexation = ifelse(grepl("NOINDEX|noindex", crawl_complete$meta_robots), "Non Indexable", "Indexable")) 
crawl_complete_indexable_pages <- filter(crawl_complete, indexation =="Indexable" & Canonical_indexability == "Canonical Matching" )

# Then, keep only the URL column for the next steps
column_to_keep_sc <- c("Url")
# Subset the filtered data frame (not INDEX, which still contains every crawled URL)
crawl_complete_indexable_pages <- crawl_complete_indexable_pages[column_to_keep_sc]
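A quick check (optional) shows how many URLs survived the filter compared with the full crawl:

# Compare the full crawl with the indexable subset
nrow(crawl_complete)
nrow(crawl_complete_indexable_pages)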

Step #4: Group URLs by page type

# Split URLs by page type and create one data frame per type
crawl <- mutate(crawl_complete_indexable_pages, category = ifelse(str_detect(crawl_complete_indexable_pages$Url,"/blog/"), "Articles",
                                         ifelse(str_detect(crawl_complete_indexable_pages$Url, "/learn/"), "Learn pages",
                                                ifelse(str_detect(crawl_complete_indexable_pages$Url, "Products"),"Products", "Community"))))

# Create dataframes that will become our sitemaps
sitemap_1 <- filter(crawl, category =="Articles")
column_to_keep_sc <- c("Url")
sitemap_articles = sitemap_1[column_to_keep_sc]

sitemap_2 <- filter(crawl, category =="Learn pages")
column_to_keep_sc <- c("Url")
sitemap_learn = sitemap_2[column_to_keep_sc]

sitemap_3 <- filter(crawl, category =="Products")
column_to_keep_sc <- c("Url")
sitemap_products = sitemap_3[column_to_keep_sc]
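Before rendering anything, a quick count per category (optional) helps confirm the URL patterns above match what you expect:

# Number of URLs per page type
crawl %>% count(category)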

Step #5: Start creating your XML sitemaps

# Create XML sitemap template
require(whisker)
require(httr)
tpl <- '<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 {{#links}}
   <url>
      <loc>{{{loc}}}</loc>
      <lastmod>{{{lastmod}}}</lastmod>
      <changefreq>{{{changefreq}}}</changefreq>
      <priority>{{{priority}}}</priority>
   </url>
 {{/links}}
</urlset>
'
# Gather the article URLs
links <- as.character(sitemap_articles$Url)

# For each URL, build the fields expected by the sitemap template
map_links <- function(l) {
  tmp <- GET(l)
  d <- tmp$headers[['last-modified']]
  
  list(loc = l,
       # The Last-Modified response header (d) could be parsed here instead;
       # for simplicity we stamp today's date
       #lastmod = format(as.Date(d, format = "%Y.%m.%d")),
       lastmod = format(Sys.time(), "%Y-%m-%d"),
       changefreq = "monthly",
       priority = "0.8")
}

print(map_links)
links <- lapply(links, map_links)
print(links)
cat(whisker.render(tpl),file="sitemap_article_moz.xml")
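As an optional check, xml2 (loaded in Step #1 but not used so far) can confirm that the file we just wrote is well-formed; read_xml() throws an error if it is not:

# Optional: make sure the generated sitemap parses as valid XML
sitemap_check <- read_xml("sitemap_article_moz.xml")
xml_name(sitemap_check)   # should return "urlset"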


# Start again for each sitemap (map_links is already defined, no need to redefine it)

links <- as.character(sitemap_learn$Url)
links <- lapply(links, map_links)
cat(whisker.render(tpl), file = "sitemap_learn_moz.xml")
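We built sitemap_products in Step #4 but never wrote it out. For completeness, here is the same pattern applied to it; the output filename below is just a suggestion:

# Same recipe for the products sitemap (the filename is our own choice)
links <- as.character(sitemap_products$Url)
links <- lapply(links, map_links)
cat(whisker.render(tpl), file = "sitemap_products_moz.xml")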

Full script  🤙

setwd("~/Desktop/RSTUDIO/SITEMAP")

# Store the list of libraries we are going to use
packages <- c("Rcrawler","dplyr", "stringr","xml2","tidyverse","tidyr","whisker","httr")

# Automatically install the required libraries that are not installed yet
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(packages, rownames(installed.packages())))  
}
library(Rcrawler)
library(dplyr)
library(stringr)
library(xml2)
library(tidyverse)
library(tidyr)


#1 - I SET UP MY CRAWL
CustomLabels <- c("canonical_tag", "meta_robots")

CustomXPaths <- c("//link[@rel='canonical']/@href", "//meta[@name='robots']/@content")

#2 - WE START CRAWLING 
Rcrawler(Website = "https://moz.com/blog", 
         ExtractXpathPat = CustomXPaths, 
         PatternsNames = CustomLabels)

#3 - WE MERGE DATA
crawl <-data.frame(do.call("rbind", DATA))
crawl_complete <- cbind(INDEX,crawl)
Idurl = as.numeric(crawl_complete$Id)
crawl_complete = cbind(Idurl,crawl_complete)
       
#4 - I IDENTIFY INDEXABLE PAGES
crawl_complete <- mutate(crawl_complete, Canonical_indexability = ifelse(Url == canonical_tag | is.na(crawl_complete$canonical_tag), "Canonical Matching", "Canonical not Matching")) 
crawl_complete <- mutate(crawl_complete, indexation = ifelse(grepl("NOINDEX|noindex", crawl_complete$meta_robots), "Non Indexable", "Indexable")) 

#5 - I KEEP ONLY INDEXABLE PAGES
crawl_complete_indexable_pages <- filter(crawl_complete, indexation =="Indexable" & Canonical_indexability == "Canonical Matching" )

#6 - I KEEP ONLY URLS
column_to_keep_sc <- c("Url")
crawl_complete_indexable_pages <- crawl_complete_indexable_pages[column_to_keep_sc]


#7 - I SPLIT BY PAGE TYPE (OPTIONAL)
crawl <- mutate(crawl_complete_indexable_pages, category = ifelse(str_detect(crawl_complete_indexable_pages$Url,"/blog/"), "Articles",
                                         ifelse(str_detect(crawl_complete_indexable_pages$Url, "/learn/"), "Learn pages",
                                                ifelse(str_detect(crawl_complete_indexable_pages$Url, "Products"),"Products", "Community"))))
#8 - I CREATE SEVERAL FILES
sitemap_1 <- filter(crawl, category =="Articles")
column_to_keep_sc <- c("Url")
sitemap_articles = sitemap_1[column_to_keep_sc]

sitemap_2 <- filter(crawl, category =="Learn pages")
column_to_keep_sc <- c("Url")
sitemap_learn = sitemap_2[column_to_keep_sc]

sitemap_3 <- filter(crawl, category =="Products")
column_to_keep_sc <- c("Url")
sitemap_products = sitemap_3[column_to_keep_sc]



#9 - FROM THESE FILES, I CREATE SITEMAPS
require(whisker)
require(httr)
tpl <- '<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 {{#links}}
   <url>
      <loc>{{{loc}}}</loc>
      <lastmod>{{{lastmod}}}</lastmod>
      <changefreq>{{{changefreq}}}</changefreq>
      <priority>{{{priority}}}</priority>
   </url>
 {{/links}}
</urlset>
'
links = as.character(sitemap_articles$Url)


map_links <- function(l) {
  tmp <- GET(l)
  d <- tmp$headers[['last-modified']]
  
  list(loc=l,
       #lastmod=format(as.Date(d,format="%Y.%m.%d")),
       lastmod=format(Sys.time(), "%Y-%m-%d"),
       changefreq="monthly",
       priority="0.8")
}

print(map_links)
links <- lapply(links, map_links)
print(links)
cat(whisker.render(tpl),file="sitemap_articles_moz.xml")


