Задание 2
Yasir
2 years ago
```library(rvest)
library(purrr)
library(stringr)

# For every <ol> node, take the text of its <li> items and reduce each item to
# a "<name> <number>" string: all spaces are removed first, parenthesised
# fragments are dropped, the space-stripped labels "Численностьнаселения" /
# "Количествожителей" are replaced by a single space, and newlines are removed.
# NOTE(review): html_nodes() is called here without a document as its first
# argument and the result is not assigned, while the code that follows reads
# `lines` -- presumably a leading `lines <- <parsed page> %>%` was lost;
# confirm and restore it.
html_nodes("ol") %>%
  map(
    ~ html_nodes(.x, "li") %>%
      html_text() %>%
      gsub(pattern = " ", replacement = "") %>%
      # `replacement` is spelled out in full instead of relying on partial
      # argument matching (`repl`).
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "Численностьнаселения", replacement = " ") %>%
      gsub(pattern = "Количествожителей", replacement = " ") %>%
      gsub(pattern = "\\(.+?\\)", replacement = "") %>%
      gsub(pattern = "\n", replacement = "")
  )

# Build a two-column table (city name, population) from the scraped strings.
# NOTE(review): no assignment to `lines` is visible in this script; it is
# presumably the result of the scraping pipeline just before this -- confirm.
pop.largest <- as.data.frame(
  matrix(
    # Only the first element of the list is kept ([1]); each of its strings
    # is split on a space and the pieces are laid out row by row in pairs.
    unlist(lapply(lines, strsplit, split = " ")[1]),
    byrow = TRUE, ncol = 2,
    dimnames = list(NULL, c("cities", "Население"))
  ),
  stringsAsFactors = FALSE
)

# Keep capture group 1 of the match for each city string.
# NOTE(review): the first alternative `\\w*` can always match, so the
# `\\w*-\\w*` branch looks unreachable and a hyphenated name would be cut at
# the first hyphen -- confirm whether that is intended.
pop.largest$cities <- str_match(
  pop.largest$cities, "(\\w*|\\w*-\\w*)(\\[.*\\])?"
)[, 2]
pop.largest$Население <- as.integer(pop.largest$Население)

# For every <ol> node, take the text of its <li> items and reduce each item to
# a "<name> <number>" string: all spaces are removed first, parenthesised
# fragments are dropped, and the space-stripped labels "Численностьнаселения" /
# "Количествожителей" are replaced by a single space.
# NOTE(review): html_nodes() is called here without a document as its first
# argument and the result is not assigned, while the code that follows reads
# `lines2` -- presumably a leading `lines2 <- <parsed page> %>%` was lost;
# confirm and restore it.
html_nodes("ol") %>%
  map(
    ~ html_nodes(.x, "li") %>%
      html_text() %>%
      gsub(pattern = " ", replacement = "") %>%
      # `replacement` is spelled out in full instead of relying on partial
      # argument matching (`repl`).
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "Численностьнаселения", replacement = " ") %>%
      gsub(pattern = "Количествожителей", replacement = " ") %>%
      gsub(pattern = "\\(.+?\\)", replacement = "")
  )

# Build a two-column table (city name, population) from the scraped strings.
# NOTE(review): no assignment to `lines2` is visible in this script; it is
# presumably the result of the scraping pipeline just before this -- confirm.
pop.largest2 <- as.data.frame(
  matrix(
    # Only the first element of the list is kept ([1]); each of its strings
    # is split on a space and the pieces are laid out row by row in pairs.
    unlist(lapply(lines2, strsplit, split = " ")[1]),
    byrow = TRUE, ncol = 2,
    dimnames = list(NULL, c("cities", "Население"))
  ),
  stringsAsFactors = FALSE
)

# Keep capture group 1 of the match for each city string.
# NOTE(review): the first alternative `\\w*` can always match, so the
# `\\w*-\\w*` branch looks unreachable and a hyphenated name would be cut at
# the first hyphen -- confirm whether that is intended.
pop.largest2$cities <- str_match(
  pop.largest2$cities, "(\\w*|\\w*-\\w*)(\\[.*\\])?"
)[, 2]
pop.largest2$Население <- as.integer(pop.largest2$Население)

# For every <ol> node, take the text of its <li> items and reduce each item to
# a "<name> <number>" string: all spaces are removed first, parenthesised
# fragments are dropped, and the space-stripped labels "Численностьнаселения" /
# "Количествожителей" are replaced by a single space.
# NOTE(review): html_nodes() is called here without a document as its first
# argument and the result is not assigned, while the code that follows reads
# `lines3` -- presumably a leading `lines3 <- <parsed page> %>%` was lost;
# confirm and restore it.
html_nodes("ol") %>%
  map(
    ~ html_nodes(.x, "li") %>%
      html_text() %>%
      gsub(pattern = " ", replacement = "") %>%
      # `replacement` is spelled out in full instead of relying on partial
      # argument matching (`repl`).
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "Численностьнаселения", replacement = " ") %>%
      gsub(pattern = "Количествожителей", replacement = " ") %>%
      gsub(pattern = "\\(.+?\\)", replacement = "")
  )

# Build a two-column table (city name, population) from the scraped strings.
# NOTE(review): no assignment to `lines3` is visible in this script; it is
# presumably the result of the scraping pipeline just before this -- confirm.
pop.largest3 <- as.data.frame(
  matrix(
    # Only the first element of the list is kept ([1]); each of its strings
    # is split on a space and the pieces are laid out row by row in pairs.
    unlist(lapply(lines3, strsplit, split = " ")[1]),
    byrow = TRUE, ncol = 2,
    dimnames = list(NULL, c("cities", "Население"))
  ),
  stringsAsFactors = FALSE
)

# Keep capture group 1 of the match for each city string.
# NOTE(review): the first alternative `\\w*` can always match, so the
# `\\w*-\\w*` branch looks unreachable and a hyphenated name would be cut at
# the first hyphen -- confirm whether that is intended.
pop.largest3$cities <- str_match(
  pop.largest3$cities, "(\\w*|\\w*-\\w*)(\\[.*\\])?"
)[, 2]
pop.largest3$Население <- as.integer(pop.largest3$Население)

# For every <ol> node, take the text of its <li> items and reduce each item to
# a "<name> <number>" string: all spaces are removed first, parenthesised
# fragments are dropped, and the space-stripped labels "Численностьнаселения" /
# "Количествожителей" are replaced by a single space.
# NOTE(review): html_nodes() is called here without a document as its first
# argument and the result is not assigned, while the code that follows reads
# `lines4` -- presumably a leading `lines4 <- <parsed page> %>%` was lost;
# confirm and restore it.
html_nodes("ol") %>%
  map(
    ~ html_nodes(.x, "li") %>%
      html_text() %>%
      gsub(pattern = " ", replacement = "") %>%
      # `replacement` is spelled out in full instead of relying on partial
      # argument matching (`repl`).
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "Численностьнаселения", replacement = " ") %>%
      gsub(pattern = "Количествожителей", replacement = " ") %>%
      gsub(pattern = "\\(.+?\\)", replacement = "")
  )

# Build a two-column table (city name, population) from the scraped strings.
# NOTE(review): no assignment to `lines4` is visible in this script; it is
# presumably the result of the scraping pipeline just before this -- confirm.
pop.largest4 <- as.data.frame(
  matrix(
    # Only the first element of the list is kept ([1]); each of its strings
    # is split on a space and the pieces are laid out row by row in pairs.
    unlist(lapply(lines4, strsplit, split = " ")[1]),
    byrow = TRUE, ncol = 2,
    dimnames = list(NULL, c("cities", "Население"))
  ),
  stringsAsFactors = FALSE
)

# Keep capture group 1 of the match for each city string.
# NOTE(review): the first alternative `\\w*` can always match, so the
# `\\w*-\\w*` branch looks unreachable and a hyphenated name would be cut at
# the first hyphen -- confirm whether that is intended.
pop.largest4$cities <- str_match(
  pop.largest4$cities, "(\\w*|\\w*-\\w*)(\\[.*\\])?"
)[, 2]
pop.largest4$Население <- as.integer(pop.largest4$Население)

# For every <ol> node, take the text of its <li> items and reduce each item to
# a "<name> <number>" string: all spaces are removed first, parenthesised
# fragments are dropped, and the space-stripped labels "Численностьнаселения" /
# "Количествожителей" are replaced by a single space.
# NOTE(review): html_nodes() is called here without a document as its first
# argument and the result is not assigned, while the code that follows reads
# `lines5` -- presumably a leading `lines5 <- <parsed page> %>%` was lost;
# confirm and restore it.
html_nodes("ol") %>%
  map(
    ~ html_nodes(.x, "li") %>%
      html_text() %>%
      gsub(pattern = " ", replacement = "") %>%
      # `replacement` is spelled out in full instead of relying on partial
      # argument matching (`repl`).
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "Численностьнаселения", replacement = " ") %>%
      gsub(pattern = "Количествожителей", replacement = " ") %>%
      gsub(pattern = "\\(.+?\\)", replacement = "")
  )

# Build a two-column table (city name, population) from the scraped strings.
# NOTE(review): no assignment to `lines5` is visible in this script; it is
# presumably the result of the scraping pipeline just before this -- confirm.
pop.largest5 <- as.data.frame(
  matrix(
    # Only the first element of the list is kept ([1]); each of its strings
    # is split on a space and the pieces are laid out row by row in pairs.
    unlist(lapply(lines5, strsplit, split = " ")[1]),
    byrow = TRUE, ncol = 2,
    dimnames = list(NULL, c("cities", "Население"))
  ),
  stringsAsFactors = FALSE
)

# Keep capture group 1 of the match for each city string.
# NOTE(review): the first alternative `\\w*` can always match, so the
# `\\w*-\\w*` branch looks unreachable and a hyphenated name would be cut at
# the first hyphen -- confirm whether that is intended.
pop.largest5$cities <- str_match(
  pop.largest5$cities, "(\\w*|\\w*-\\w*)(\\[.*\\])?"
)[, 2]
pop.largest5$Население <- as.integer(pop.largest5$Население)

# For every <ol> node, take the text of its <li> items and reduce each item to
# a "<name> <number>" string: all spaces are removed first, parenthesised
# fragments are dropped, and the space-stripped labels "Численностьнаселения" /
# "Количествожителей" are replaced by a single space.
# NOTE(review): html_nodes() is called here without a document as its first
# argument and the result is not assigned, while the code that follows reads
# `lines6` -- presumably a leading `lines6 <- <parsed page> %>%` was lost;
# confirm and restore it.
html_nodes("ol") %>%
  map(
    ~ html_nodes(.x, "li") %>%
      html_text() %>%
      gsub(pattern = " ", replacement = "") %>%
      # `replacement` is spelled out in full instead of relying on partial
      # argument matching (`repl`).
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*-\\w*-\\w*\\)", replacement = "") %>%
      gsub(pattern = "\\(\\w*\\)", replacement = "") %>%
      gsub(pattern = "Численностьнаселения", replacement = " ") %>%
      gsub(pattern = "Количествожителей", replacement = " ") %>%
      gsub(pattern = "\\(.+?\\)", replacement = "")
  )

# Build a two-column table (city name, population) from the scraped strings.
# NOTE(review): no assignment to `lines6` is visible in this script; it is
# presumably the result of the scraping pipeline just before this -- confirm.
pop.largest6 <- as.data.frame(
  matrix(
    # Only the first element of the list is kept ([1]); each of its strings
    # is split on a space and the pieces are laid out row by row in pairs.
    unlist(lapply(lines6, strsplit, split = " ")[1]),
    byrow = TRUE, ncol = 2,
    dimnames = list(NULL, c("cities", "Население"))
  ),
  stringsAsFactors = FALSE
)

# Keep capture group 1 of the match for each city string.
# NOTE(review): the first alternative `\\w*` can always match, so the
# `\\w*-\\w*` branch looks unreachable and a hyphenated name would be cut at
# the first hyphen -- confirm whether that is intended.
pop.largest6$cities <- str_match(
  pop.largest6$cities, "(\\w*|\\w*-\\w*)(\\[.*\\])?"
)[, 2]
pop.largest6$Население <- as.integer(pop.largest6$Население)

# Stack the six per-page tables into a single data frame, then draw a
# histogram of its second column (named "Население" where the tables are
# built) using 100 breaks.
pop.most <- do.call(
  rbind,
  list(
    pop.largest, pop.largest2, pop.largest3,
    pop.largest4, pop.largest5, pop.largest6
  )
)
hist(pop.most[[2]], breaks = 100)```