# Paste-site header (not part of the script):
# Untitled | unknown | plain_text | 4 years ago | 6.8 kB | 6 | Indexable
library(rvest)
library(purrr)
library(stringr)
# Scrape city population figures from six ranking pages of the same site,
# stack them into one table, and plot the distribution of the populations.

# Clean the raw text of the list items so that each item becomes
# "<city> <population>" separated by exactly one space.
#
# text: character vector, one element per <li>.
# Returns a character vector of the same length.
clean_item_text <- function(text) {
  # Rules are applied in this order; each is c(pattern, replacement).
  rules <- list(
    # Drop every space first: afterwards the only spaces in the text are the
    # separators substituted for the labels below. Side effect: multi-word
    # names lose their internal spaces.
    c(" ", ""),
    # Parenthesised remarks made of word characters, with 0-2 hyphens.
    c("\\(\\w*\\)", ""),
    c("\\(\\w*-\\w*\\)", ""),
    c("\\(\\w*-\\w*-\\w*\\)", ""),
    c("\\(\\w*\\)", ""),
    # Label between the name and the number (two spellings, already without
    # spaces) becomes the single separating space.
    c("Численностьнаселения", " "),
    c("Количествожителей", " "),
    # Any remaining parenthesised remark.
    c("\\(.+?\\)", ""),
    # Line breaks inside an item would otherwise end up in the city name.
    c("\n", "")
  )
  for (rule in rules) {
    text <- gsub(pattern = rule[1], replacement = rule[2], x = text)
  }
  text
}

# Turn the raw <li> texts of one page into a two-column data frame.
#
# items: character vector of list-item texts.
# Returns a data frame with columns `cities` (character) and
# `Население` (integer); values that are not numbers become NA.
parse_city_items <- function(items) {
  fields <- unlist(strsplit(clean_item_text(items), split = " ", fixed = TRUE))
  # Fields are expected to alternate name, population; they are laid out row by
  # row. matrix() warns when the number of fields does not fit two columns.
  pop_table <- as.data.frame(
    matrix(
      fields,
      byrow = TRUE, ncol = 2,
      dimnames = list(NULL, c("cities", "Население"))
    ),
    stringsAsFactors = FALSE
  )
  # Keep the leading run of word characters and hyphens, which drops trailing
  # bracketed marks such as "[1]" while preserving hyphenated names.
  pop_table$cities <- str_match(pop_table$cities, "^([\\w-]*)")[, 2]
  pop_table$Население <- as.integer(pop_table$Население)
  pop_table
}

# Download one ranking page and parse the items of its first ordered list.
#
# page_url: address of the page.
# Returns the data frame produced by parse_city_items().
# Stops with an error when the page has no <ol> or the first <ol> has no <li>.
scrape_city_population <- function(page_url) {
  ordered_lists <- html_nodes(read_html(page_url), "ol")
  if (length(ordered_lists) == 0L) {
    stop("No <ol> element found at ", page_url, call. = FALSE)
  }
  # Only the first ordered list on the page is used.
  items <- html_text(html_nodes(ordered_lists[[1]], "li"))
  if (length(items) == 0L) {
    stop("First <ol> has no <li> items at ", page_url, call. = FALSE)
  }
  parse_city_items(items)
}

# One page per category; the percent-encoded `name` value is a Russian word,
# glossed in English next to each address.
page_urls <- c(
  # "millionaires"
  "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BC%D0%B8%D0%BB%D0%BB%D0%B8%D0%BE%D0%BD%D0%B5%D1%80%D1%8B",
  # "largest"
  "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BA%D1%80%D1%83%D0%BF%D0%BD%D0%B5%D0%B9%D1%88%D0%B8%D0%B5",
  # "large"
  "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BA%D1%80%D1%83%D0%BF%D0%BD%D1%8B%D0%B5",
  # "big"
  "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%B1%D0%BE%D0%BB%D1%8C%D1%88%D0%B8%D0%B5",
  # "medium"
  "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D1%81%D1%80%D0%B5%D0%B4%D0%BD%D0%B8%D0%B5",
  # "small"
  "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BC%D0%B0%D0%BB%D1%8B%D0%B5"
)

pop_tables <- lapply(page_urls, scrape_city_population)

# Per-page tables, in the same order as page_urls.
pop.largest <- pop_tables[[1]]
pop.largest2 <- pop_tables[[2]]
pop.largest3 <- pop_tables[[3]]
pop.largest4 <- pop_tables[[4]]
pop.largest5 <- pop_tables[[5]]
pop.largest6 <- pop_tables[[6]]

# All pages stacked into one table.
pop.most <- do.call(rbind, pop_tables)

# Distribution of the population column (second column).
hist(pop.most[[2]], breaks = 100)