# Untitled
#
# avatar
# unknown
# plain_text
# 3 years ago
# 6.8 kB
# 4
# Indexable
library(rvest)
library(purrr)
library(stringr)

# ---------------------------------------------------------------------------
# Scrape city population rankings from six category pages of the same site,
# combine them into one table and plot a histogram of the populations.
#
# Every page is processed by the same three steps:
#   1. download the page and pull the text of each <li> inside each <ol>;
#   2. clean each entry down to "<city> <population>";
#   3. split the entries into a two-column data frame.
# ---------------------------------------------------------------------------

# Clean raw <li> texts so that each entry has the form "<city> <population>".
#
# entries: character vector of raw list-item texts.
# Returns a character vector of the same length.
#
# The substitutions run in a fixed order:
#   * all spaces are removed first, so multi-word text is glued together;
#   * parenthesised groups made of word characters with zero, one or two
#     hyphens are removed (the first pattern is applied again at the end to
#     catch groups exposed by the earlier removals);
#   * the two population labels (written without spaces, because spaces were
#     already removed) are replaced by a single space, which becomes the
#     separator between city and population;
#   * any remaining parenthesised group is removed;
#   * newlines are removed, so a leading or trailing line break cannot end up
#     inside the city name or the population field.
clean_city_entries <- function(entries) {
  paren_patterns <- c(
    "\\(\\w*\\)",
    "\\(\\w*-\\w*\\)",
    "\\(\\w*-\\w*-\\w*\\)",
    "\\(\\w*\\)"
  )
  population_labels <- c("Численностьнаселения", "Количествожителей")

  entries <- gsub(pattern = " ", replacement = "", x = entries)
  for (pattern in paren_patterns) {
    entries <- gsub(pattern = pattern, replacement = "", x = entries)
  }
  for (label in population_labels) {
    entries <- gsub(pattern = label, replacement = " ", x = entries)
  }
  entries <- gsub(pattern = "\\(.+?\\)", replacement = "", x = entries)
  gsub(pattern = "\n", replacement = "", x = entries)
}

# Download one ranking page and return its cleaned list entries.
#
# url: address of the page to read.
# Returns a list with one element per <ol> found on the page; each element is
# a character vector of cleaned <li> texts (see clean_city_entries()).
scrape_city_lines <- function(url) {
  read_html(url) %>%
    html_nodes("ol") %>%
    map(function(ol) clean_city_entries(html_text(html_nodes(ol, "li"))))
}

# Turn the cleaned entries of a page into a data frame.
#
# lines: list as returned by scrape_city_lines(). Only the first <ol> on the
#        page is used; any further lists are ignored.
# Returns a data frame with a character column "cities" and an integer column
# "Население" (population).
#
# Entries that do not split into exactly two space-separated fields are
# dropped with a warning; filling a two-column matrix from the flattened
# tokens would otherwise shift every following row by one field.
build_population_table <- function(lines) {
  if (length(lines) == 0L) {
    stop("no <ol> element found on the page", call. = FALSE)
  }

  parts <- strsplit(lines[[1]], split = " ", fixed = TRUE)
  well_formed <- lengths(parts) == 2L
  if (!all(well_formed)) {
    warning(
      sum(!well_formed),
      " list entries did not split into city and population; dropped",
      call. = FALSE
    )
  }
  parts <- parts[well_formed]

  cities <- vapply(parts, `[[`, character(1), 1L)
  population <- vapply(parts, `[[`, character(1), 2L)

  # Keep the leading run of word characters, including hyphen-joined parts
  # (e.g. "Ростов-на-Дону"), and drop anything after it such as a "[...]"
  # marker. An alternation of the form (\w*|\w*-\w*) would always stop at
  # the first hyphen because its first branch already succeeds.
  cities <- str_match(cities, "(\\w+(?:-\\w+)*)")[, 2]

  result <- data.frame(cities, as.integer(population), stringsAsFactors = FALSE)
  names(result) <- c("cities", "Население")
  result
}

# The `name` query parameter is the percent-encoded Russian category name;
# the decoded value is given next to each URL.

# name = "миллионеры"
url <- "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BC%D0%B8%D0%BB%D0%BB%D0%B8%D0%BE%D0%BD%D0%B5%D1%80%D1%8B"
lines <- scrape_city_lines(url)
pop.largest <- build_population_table(lines)

# name = "крупнейшие"
url2 <- "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BA%D1%80%D1%83%D0%BF%D0%BD%D0%B5%D0%B9%D1%88%D0%B8%D0%B5"
lines2 <- scrape_city_lines(url2)
pop.largest2 <- build_population_table(lines2)

# name = "крупные"
url3 <- "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BA%D1%80%D1%83%D0%BF%D0%BD%D1%8B%D0%B5"
lines3 <- scrape_city_lines(url3)
pop.largest3 <- build_population_table(lines3)

# name = "большие"
url4 <- "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%B1%D0%BE%D0%BB%D1%8C%D1%88%D0%B8%D0%B5"
lines4 <- scrape_city_lines(url4)
pop.largest4 <- build_population_table(lines4)

# name = "средние"
url5 <- "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D1%81%D1%80%D0%B5%D0%B4%D0%BD%D0%B8%D0%B5"
lines5 <- scrape_city_lines(url5)
pop.largest5 <- build_population_table(lines5)

# name = "малые"
url6 <- "https://xn----7sbiew6aadnema7p.xn--p1ai/reytin-cities.php?name=%D0%BC%D0%B0%D0%BB%D1%8B%D0%B5"
lines6 <- scrape_city_lines(url6)
pop.largest6 <- build_population_table(lines6)

# Stack the six category tables and plot the distribution of the population
# column (second column) using 100 bins.
pop.most <- rbind(
  pop.largest, pop.largest2, pop.largest3,
  pop.largest4, pop.largest5, pop.largest6
)
hist(pop.most[[2]], breaks = 100)
# Editor is loading...