XML sitemaps are a must-have when you want to make sure Google crawls your pages. There are plenty of free and paid tools to generate them, but this article will show you how to create XML sitemaps for free with a simple R script. It costs nothing and only requires a little knowledge of R for SEO. If you are more comfortable with Python, check out our previous article about how to create an XML sitemap with Python. We can of course automate the whole process with a cron job and run it every month to keep the sitemaps up to date (a minimal example is sketched below); I will demonstrate the full automation in the next article 😉
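For reference, scheduling the script with cron could look like the single line below. This is only a sketch: the script path is a placeholder, and it assumes R (and the Rscript command) is installed on the machine running the job.

# Hypothetical crontab entry: run the sitemap script at 06:00 on the 1st of every month
0 6 1 * * Rscript /path/to/generate_sitemaps.R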
Enough talk, let's start coding. For this example, we will use the Moz Blog to test our script.
What are we going to do here? 🧐
- Crawl your website
- Get SEO-friendly URLs
- Classify URLs by page type
- Create sitemaps
Follow the steps 🙈
Step #1: Load the libraries you need
# Define your working directory
setwd("~/Desktop/xxxx/xxxxx")

# Install libraries if they are not installed yet
packages <- c("Rcrawler", "dplyr", "stringr", "xml2", "tidyverse", "tidyr")

if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(packages, rownames(installed.packages())))
}

# Load packages
library(Rcrawler)
library(dplyr)
library(stringr)
library(xml2)
library(tidyverse)
library(tidyr)
Step #2: Start a crawl
# I define what I need to scrape during the crawl
CustomLabels <- c("canonical_tag", "meta_robots")

CustomXPaths <- c("//link[@rel='canonical']/@href",
                  "//meta[@name='robots']/@content")

# Start the crawl right here
Rcrawler(Website = "https://moz.com/blog",
         ExtractXpathPat = CustomXPaths,
         PatternsNames = CustomLabels)
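By default, Rcrawler will crawl everything it can reach and can be quite aggressive. If you want a politer, bounded crawl, the same call accepts a few optional arguments; the values below are only illustrative assumptions, so tune them to your own site.

# Optional: a more conservative crawl (the values are illustrative, not requirements)
Rcrawler(Website = "https://moz.com/blog",
         ExtractXpathPat = CustomXPaths,
         PatternsNames = CustomLabels,
         no_cores = 2,        # limit parallel workers
         no_conn = 2,         # limit simultaneous connections
         MaxDepth = 3,        # stop after three levels of links
         RequestsDelay = 1,   # wait one second between requests
         Obeyrobots = TRUE)   # respect robots.txt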
Step #3: Clean the data and keep only SEO-friendly URLs
# Time to merge data from the crawl and the scraping
crawl <- data.frame(do.call("rbind", DATA))
crawl_complete <- cbind(INDEX, crawl)

Idurl <- as.numeric(crawl_complete$Id)
crawl_complete <- cbind(Idurl, crawl_complete)

# Now, let's keep only SEO-friendly URLs
crawl_complete <- mutate(crawl_complete,
                         Canonical_indexability = ifelse(Url == canonical_tag | is.na(crawl_complete$canonical_tag),
                                                         "Canonical Matching", "Canonical not Matching"))

crawl_complete <- mutate(crawl_complete,
                         indexation = ifelse(grepl("NOINDEX|noindex", crawl_complete$meta_robots),
                                             "Non Indexable", "Indexable"))

crawl_complete_indexable_pages <- filter(crawl_complete,
                                         indexation == "Indexable" & Canonical_indexability == "Canonical Matching")

# Then, I just need the URLs for the next steps
column_to_keep_sc <- c("Url")
crawl_complete_indexable_pages <- crawl_complete_indexable_pages[column_to_keep_sc]
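The same cleaning logic can also be written as a single dplyr pipeline, which is easier to extend later. This is just a sketch built on the columns the crawl produced above (Url, canonical_tag, meta_robots): a page is kept when it is self-canonical (or has no canonical tag) and carries no noindex directive.

# A minimal pipeline version of the indexability filter (same columns as above)
indexable_urls <- crawl_complete %>%
  mutate(
    canonical_ok = is.na(canonical_tag) | Url == canonical_tag,        # self-canonical or no canonical at all
    indexable    = !grepl("noindex", meta_robots, ignore.case = TRUE)  # no noindex in meta robots
  ) %>%
  filter(canonical_ok, indexable) %>%
  select(Url)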
Step #4: Group URLs by page type
# Let's now split URLs by page type and create data frames
crawl <- mutate(crawl_complete_indexable_pages,
                category = ifelse(str_detect(crawl_complete_indexable_pages$Url, "/blog/"), "Articles",
                           ifelse(str_detect(crawl_complete_indexable_pages$Url, "/learn/"), "Learn pages",
                           ifelse(str_detect(crawl_complete_indexable_pages$Url, "Products"), "Products", "Community"))))

# Create the data frames that will become our sitemaps
sitemap_1 <- filter(crawl, category == "Articles")
column_to_keep_sc <- c("Url")
sitemap_articles <- sitemap_1[column_to_keep_sc]

sitemap_2 <- filter(crawl, category == "Learn pages")
column_to_keep_sc <- c("Url")
sitemap_learn <- sitemap_2[column_to_keep_sc]

sitemap_3 <- filter(crawl, category == "Products")
column_to_keep_sc <- c("Url")
sitemap_products <- sitemap_3[column_to_keep_sc]
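If your site has many page types, nested ifelse() calls quickly become hard to read. Here is a sketch of the same grouping written with dplyr::case_when() and base split(), using the same category labels as above.

# Same grouping with case_when(), then one data frame per category in a named list
crawl <- crawl_complete_indexable_pages %>%
  mutate(category = case_when(
    str_detect(Url, "/blog/")   ~ "Articles",
    str_detect(Url, "/learn/")  ~ "Learn pages",
    str_detect(Url, "Products") ~ "Products",
    TRUE                        ~ "Community"
  ))

sitemaps_by_type <- split(crawl["Url"], crawl$category)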
Step #5: Start creating your XML sitemaps
# Create the XML sitemap template
require(whisker)
require(httr)

tpl <- '<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{{#links}}
 <url>
  <loc>{{{loc}}}</loc>
  <lastmod>{{{lastmod}}}</lastmod>
  <changefreq>{{{changefreq}}}</changefreq>
  <priority>{{{priority}}}</priority>
 </url>
{{/links}}
</urlset>
'

links <- as.character(sitemap_articles$Url)

map_links <- function(l) {
  tmp <- GET(l)
  d <- tmp$headers[['last-modified']]
  list(loc = l,
       # lastmod = format(as.Date(d, format = "%Y.%m.%d")),
       lastmod = format(Sys.time(), "%Y-%m-%d"),
       changefreq = "monthly",
       priority = "0.8")
}

links <- lapply(links, map_links)

cat(whisker.render(tpl, data = list(links = links)), file = "sitemap_articles_moz.xml")

# Start again for each sitemap
links <- as.character(sitemap_learn$Url)
links <- lapply(links, map_links)

cat(whisker.render(tpl, data = list(links = links)), file = "sitemap_learn_moz.xml")
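Rather than copy-pasting this block for every sitemap, you could wrap the rendering in a small helper. This is only a sketch reusing the tpl template and map_links() function defined above; the write_sitemap() name is mine, not part of any package.

# Hypothetical helper: render one sitemap file from a vector of URLs
write_sitemap <- function(urls, file) {
  links <- lapply(as.character(urls), map_links)  # reuse map_links() from above
  cat(whisker.render(tpl, data = list(links = links)), file = file)
}

write_sitemap(sitemap_articles$Url, "sitemap_articles_moz.xml")
write_sitemap(sitemap_learn$Url,    "sitemap_learn_moz.xml")
write_sitemap(sitemap_products$Url, "sitemap_products_moz.xml")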
Full script 🤙
setwd("~/Desktop/RSTUDIO/SITEMAP") # Je charge dans une variable la liste des librairies que je vais utiliser packages <- c("Rcrawler","dplyr", "stringr","xml2","tidyverse","tidyr") # fonction permettant d'installer automatiquement les librairies n?cessaires & non install?es if (length(setdiff(packages, rownames(installed.packages()))) > 0) { install.packages(setdiff(packages, rownames(installed.packages()))) } library(Rcrawler) library(dplyr) library(stringr) library(xml2) library(tidyverse) library(tidyr) #1 - I SET UP MY CRAWL CustomLabels <- c("canonical_tag", "meta_robots") CustomXPaths <- c("//link[@rel='canonical']/@href", "//meta[@name='robots']/@content") #2 - WE START CRAWLING Rcrawler(Website = "https://moz.com/blog", ExtractXpathPat = CustomXPaths, PatternsNames = CustomLabels) #3 - WE MERGE DATA crawl <-data.frame(do.call("rbind", DATA)) crawl_complete <- cbind(INDEX,crawl) Idurl = as.numeric(crawl_complete$Id) crawl_complete = cbind(Idurl,crawl_complete) #2 - I IDENTIFY INDEXABLE PAGES crawl_complete <- mutate(crawl_complete, Canonical_indexability = ifelse(Url == canonical_tag | is.na(crawl_complete$canonical_tag), "Canonical Matching", "Canonical not Matching")) crawl_complete <- mutate(crawl_complete, indexation = ifelse(grepl("NOINDEX|noindex", crawl_complete$meta_robots), "Non Indexable", "Indexable")) #3 - I KEEP ONLY INDEXABLE PAGES crawl_complete_indexable_pages <- filter(crawl_complete, indexation =="Indexable" & Canonical_indexability == "Canonical Matching" ) #4 - I KEEP ONLY URLS column_to_keep_sc <- c("Url") crawl_complete_indexable_pages = INDEX[column_to_keep_sc] #5 - I SPLIT BY PAGE TYPE (NOT NECESSARY) crawl <- mutate(crawl_complete_indexable_pages, category = ifelse(str_detect(crawl_complete_indexable_pages$Url,"/blog/"), "Articles", ifelse(str_detect(crawl_complete_indexable_pages$Url, "/learn/"), "Learn pages", ifelse(str_detect(crawl_complete_indexable_pages$Url, "Products"),"Products", "Community")))) #6 - I CREATE SEVERAL FILES sitemap_1 <- filter(crawl, category =="Articles") column_to_keep_sc <- c("Url") sitemap_articles = sitemap_1[column_to_keep_sc] sitemap_2 <- filter(crawl, category =="Learn pages") column_to_keep_sc <- c("Url") sitemap_learn = sitemap_2[column_to_keep_sc] sitemap_3 <- filter(crawl, category =="Products") column_to_keep_sc <- c("Url") sitemap_products = sitemap_3[column_to_keep_sc] #4 - FROM THESE FILES, I CREATE SITEMAPS require(whisker) require(httr) tpl <- ' <?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> {{#links}} <url> <loc>{{{loc}}}</loc> <lastmod>{{{lastmod}}}</lastmod> <changefreq>{{{changefreq}}}</changefreq> <priority>{{{priority}}}</priority> </url> {{/links}} </urlset> ' links = as.character(sitemap_articles$Url) map_links <- function(l) { tmp <- GET(l) d <- tmp$headers[['last-modified']] list(loc=l, #lastmod=format(as.Date(d,format="%Y.%m.%d")), lastmod=format(Sys.time(), "%Y-%m-%d"), changefreq="monthly", priority="0.8") } print(map_links) links <- lapply(links, map_links) print(links) cat(whisker.render(tpl),file="sitemap_articles_moz.xml")