Untitled
unknown
plain_text
3 years ago
6.8 kB
4
Indexable
# Scrape city/population rankings for several city-size categories from the
# same site, combine them into one data frame (`pop.most`) and plot a
# histogram of the population column.

library(rvest)
library(purrr)
library(stringr)

# One ranking page per city-size category. The `name` query parameter is the
# percent-encoded Russian category label (decoded in the trailing comments).
category_urls <- c(
  million_plus = "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BC%D0%B8%D0%BB%D0%BB%D0%B8%D0%BE%D0%BD%D0%B5%D1%80%D1%8B", # "миллионеры"
  largest = "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BA%D1%80%D1%83%D0%BF%D0%BD%D0%B5%D0%B9%D1%88%D0%B8%D0%B5", # "крупнейшие"
  large = "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BA%D1%80%D1%83%D0%BF%D0%BD%D1%8B%D0%B5", # "крупные"
  big = "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%B1%D0%BE%D0%BB%D1%8C%D1%88%D0%B8%D0%B5", # "большие"
  medium = "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D1%81%D1%80%D0%B5%D0%B4%D0%BD%D0%B8%D0%B5", # "средние"
  small = "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BC%D0%B0%D0%BB%D1%8B%D0%B5" # "малые"
)

# Ordered (pattern, replacement) pairs applied to the raw text of every <li>.
# Order matters: spaces are removed first (so the two labels below appear
# without their inner space), parenthesised notes are stripped, and each label
# is then turned into the single space that separates city from population.
cleanup_steps <- list(
  c(" ", ""),
  c("\\(\\w*\\)", ""),
  c("\\(\\w*-\\w*\\)", ""),
  c("\\(\\w*-\\w*-\\w*\\)", ""),
  c("\\(\\w*\\)", ""),
  c("Численностьнаселения", " "), # "Population size" with the space removed
  c("Количествожителей", " "), # "Number of residents" with the space removed
  c("\\(.+?\\)", ""),
  # Previously only the first page had newlines stripped; now all pages do.
  c("\n", "")
)

#' Normalise raw list-item text to the form "<city> <population>".
#'
#' @param items Character vector with the text of each <li> element.
#' @return Character vector of the same length with every cleanup step applied.
clean_city_items <- function(items) {
  for (step in cleanup_steps) {
    items <- gsub(pattern = step[[1]], replacement = step[[2]], x = items)
  }
  items
}

#' Turn raw list-item text into a city/population data frame.
#'
#' Items that do not split into exactly two space-separated fields are dropped
#' with a warning. Filling a two-column matrix from the flattened pieces would
#' instead shift every following row by one field.
#'
#' NOTE(review): because all spaces are removed during cleanup, multi-word
#' city names end up concatenated. This presumably exists to join digit groups
#' in the population figure; confirm against the page markup.
#'
#' @param items Character vector with the text of each <li> element.
#' @return Data frame with character column `cities` and integer column
#'   `Население`.
parse_city_items <- function(items) {
  pieces <- strsplit(clean_city_items(items), split = " ", fixed = TRUE)

  well_formed <- lengths(pieces) == 2L
  if (!all(well_formed)) {
    warning(
      sum(!well_formed),
      " list item(s) did not split into a city and a population; dropped.",
      call. = FALSE
    )
    pieces <- pieces[well_formed]
  }

  raw_names <- vapply(pieces, function(p) p[[1]], character(1))
  raw_counts <- vapply(pieces, function(p) p[[2]], character(1))

  # Keep the leading run of word characters and hyphens, which discards a
  # trailing "[...]" footnote marker. The earlier alternation
  # `(\\w*|\\w*-\\w*)` always took its first branch, so hyphenated names were
  # cut at the first hyphen.
  city_names <- str_match(raw_names, "([\\w\\-]*)(\\[.*\\])?")[, 2]

  result <- data.frame(
    cities = city_names,
    population = as.integer(raw_counts),
    stringsAsFactors = FALSE
  )
  names(result)[2] <- "Население"
  result
}

#' Download one ranking page and parse its first ordered list.
#'
#' @param page_url URL of a ranking page.
#' @return Data frame as produced by `parse_city_items()`.
scrape_city_populations <- function(page_url) {
  ordered_lists <- html_nodes(read_html(page_url), "ol")
  if (length(ordered_lists) == 0L) {
    stop("No <ol> element found at ", page_url, call. = FALSE)
  }
  # Only the first <ol> on the page is used.
  items <- html_text(html_nodes(ordered_lists[[1]], "li"))
  parse_city_items(items)
}

# Per-category results, named after the entries of `category_urls`.
pop_by_category <- map(category_urls, scrape_city_populations)

# Stack all categories. `unname()` keeps the default sequential row names
# instead of "<category>.<n>" labels.
pop.most <- do.call(rbind, unname(pop_by_category))

hist(pop.most[[2]], breaks = 100)
Editor is loading...