Links:
Course Repository: https://github.com/HumblePasty/EAS648
Lab05 Webpage: https://humblepasty.github.io/EAS648/Lab/05/
Lab05 Repository: https://github.com/HumblePasty/EAS648/tree/master/Lab/05
Assignment
- Analyze the Salt Lake City dataset. Fit a model that explains suitability for urban development (provide evidence of goodness of model fit). What characteristics contribute to urban development in Salt Lake? What variable might you add to better explain urban development. (1-2 paragraphs)
- Analyze the database of amenity landscape characteristics using a clustering technique. Choose variables based on a thematic inquiry of your choice. Visualize ad provide qualitative descriptions of the groups that you derive from the clustering techniques. Provide evidence of you characterizations. Provide a map of your cluster analysis. Give an explanation of the spatial distribution of these groups. (1-2 paragraphs)
Haolin’s Note:
For this assignment, I’ll complete the tasks in 2 parts:
- Part I: Analyze the Salt Lake City dataset and Model Fitting
- Part II: Cluster Analysis of Landscape Characteristics
library(terra)
## terra 1.7.55
#first import the land cover/use data
NLCD_2001 <- rast("Lab5prj/NLCD_2001_SL.tif")
NLCD_2004 <- rast("Lab5prj/NLCD_2004_SL.tif")
NLCD_2006 <- rast("Lab5prj/NLCD_2006_SL.tif")
NLCD_2008 <- rast("Lab5prj/NLCD_2008_SL.tif")
NLCD_2011 <- rast("Lab5prj/NLCD_2011_SL.tif")
NLCD_2013 <- rast("Lab5prj/NLCD_2013_SL.tif")
NLCD_2016 <- rast("Lab5prj/NLCD_2016_SL.tif")
# Distance to parks and protected areas in km (Euclidean) for the study area
Park_dist <- rast("Lab5prj/Parks_dist_SL.tif")
# Road density for a 1 km neighborhood
Rd_dns1km <- rast("Lab5prj/Rd_dns1km_SL.tif")
# Distance to water bodies in km (Euclidean)
WaterDist <- rast("Lab5prj/WaterDist_SL.tif")
# Elevation
DEM <- rast("Lab5prj/DEM_SL.tif")
allrasters <- c(NLCD_2001, NLCD_2004, NLCD_2006, NLCD_2008, NLCD_2011, NLCD_2013, NLCD_2016, Park_dist, Rd_dns1km, WaterDist,DEM)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks terra::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
allrastersSL <- as.data.frame(allrasters, xy=TRUE)
## Here we are filtering out the no data values (stored as 128)
allrastersSL <- allrastersSL %>%
filter (NLCD_2001_SL != 128)
head(allrastersSL)
## x y NLCD_2001_SL NLCD_2004_SL NLCD_2006_SL NLCD_2008_SL
## 1 -1330860 2102550 11 52 52 95
## 2 -1330830 2102550 11 52 52 95
## 3 -1330800 2102550 11 11 11 11
## 4 -1330770 2102550 11 11 11 11
## 5 -1330740 2102550 11 11 11 11
## 6 -1330710 2102550 11 11 71 71
## NLCD_2011_SL NLCD_2013_SL NLCD_2016_SL Parks_dist_SL Rd_dns1km_SL
## 1 95 95 95 1358.308 0
## 2 95 95 95 1343.317 0
## 3 95 11 11 1328.834 0
## 4 95 11 11 1314.876 0
## 5 95 11 95 1301.461 0
## 6 71 71 71 1288.255 0
## WaterDist_SL DEM_SL
## 1 301.4963 1281
## 2 305.9412 1281
## 3 313.2092 1281
## 4 323.1099 1281
## 5 330.0000 1281
## 6 331.3608 1281
Pre-plotting the Data
plot(NLCD_2001)
library(leaflet)
library(ggplot2)
sampleSLrnd <- spatSample(allrasters, size=100, "random", cells=TRUE, xy=TRUE)
head(sampleSLrnd)
## cell x y NLCD_2001_SL NLCD_2004_SL NLCD_2006_SL NLCD_2008_SL
## 1 3204425 -1325760 2050800 NA NA NA NA
## 2 3124113 -1340880 2052090 52 52 52 52
## 3 1369006 -1320810 2080470 81 81 81 81
## 4 2396508 -1342470 2063850 52 52 52 52
## 5 2911941 -1302840 2055540 NA NA NA NA
## 6 1818460 -1311750 2073210 42 42 42 42
## NLCD_2011_SL NLCD_2013_SL NLCD_2016_SL Parks_dist_SL Rd_dns1km_SL
## 1 NA NA NA NA NA
## 2 52 52 52 18521.180 0.0000000
## 3 81 81 81 3113.069 0.3654729
## 4 52 52 52 12721.485 0.1037649
## 5 NA NA NA NA NA
## 6 42 42 42 0.000 0.0000000
## WaterDist_SL DEM_SL
## 1 NA NA
## 2 530.7542 1815
## 3 1772.2866 1400
## 4 391.1521 1580
## 5 NA NA
## 6 692.6038 2170
plot(NLCD_2001, main = "Sample Locations for Autocorrelation Analysis")
points(sampleSLrnd$x, sampleSLrnd$y)
# flatten the spatial data to a dataframe that has lat and long
flat_data <- as.data.frame(sampleSLrnd)
flat_data = na.omit(flat_data)
# calculate distances between all the points,
dist_matrix <- as.matrix(dist(cbind(flat_data$x, flat_data$y)))
# and generate a matrix of inverse distance weights.
dist_matix.inv <- 1/dist_matrix
diag(dist_matix.inv) <- 0
library(ape)
##
## Attaching package: 'ape'
## The following object is masked from 'package:dplyr':
##
## where
## The following objects are masked from 'package:terra':
##
## rotate, trans, zoom
# Moran.I(sampleSLrnd$Rd_dns1km_SL, dist_matix.inv)
Moran.I(flat_data$Rd_dns1km_SL, dist_matix.inv)
## $observed
## [1] 0.1491992
##
## $expected
## [1] -0.01754386
##
## $sd
## [1] 0.03219732
##
## $p.value
## [1] 2.233313e-07
allrastersSL <- allrastersSL %>%
mutate(urbanChg = (NLCD_2001_SL != 21 & NLCD_2001_SL != 22 & NLCD_2001_SL != 23 & NLCD_2001_SL != 24) & (NLCD_2016_SL == 21 | NLCD_2016_SL == 22 | NLCD_2016_SL == 23 | NLCD_2016_SL == 24))
ggplot(allrastersSL, aes(y=y, x=x, color=urbanChg)) +
geom_point(size=2, shape=15) +
ggtitle("Changed Urban Areas Distribution between 2001 and 2016") +
theme()
## calculate total new urban impervious for 2016
newUrban <- (sum(as.numeric(allrastersSL$NLCD_2016_SL == 21 | allrastersSL$NLCD_2016_SL == 22 |allrastersSL$NLCD_2016_SL == 23 | allrastersSL$NLCD_2016_SL == 24))) - (sum(as.numeric(allrastersSL$NLCD_2001_SL == 21| allrastersSL$NLCD_2001_SL == 22| allrastersSL$NLCD_2001_SL == 23| allrastersSL$NLCD_2001_SL == 24)))
## calculate total urban impervious for 2001
urban2001 <- (sum(as.numeric(allrastersSL$NLCD_2001_SL == 21| allrastersSL$NLCD_2001_SL == 22| allrastersSL$NLCD_2001_SL == 23| allrastersSL$NLCD_2001_SL == 24)))
## percentage increase in urban impervious
newUrban/urban2001* 100
## [1] 15.88895
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(plotly)
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
SL <- allrastersSL %>%
# the non urban areas in 2001
filter(NLCD_2001_SL != 21 & NLCD_2001_SL != 22 & NLCD_2001_SL != 23 & NLCD_2001_SL != 24)
SL <- SL[10:14]
# similar to tidy(), melt is a function to organize the data
SLmelt<-melt(SL)
## Using urbanChg as id variables
p <- ggplot(SLmelt, aes(x=urbanChg, y=value,fill=variable))+
geom_boxplot()+
facet_grid(.~variable)+
labs(x="X (binned)")+
theme(axis.text.x=element_text(angle=-90, vjust=0.4,hjust=1))
p
###Grab all the developed cells (presence)
newUrban <- SL %>%
filter(urbanChg == TRUE)
###Grab all the nondeveloped and not previously urban cells (absence)
nonUrban <- SL %>%
filter(urbanChg == FALSE)
###Get a random sample of the absence data
### that is twice as large as the presence data
index <- sample(1:nrow(nonUrban), (round(nrow(newUrban)* 2)))
SLsampleUrban <- nonUrban[index, ]
### combine the original presence and absence data
SLsample <- rbind(SLsampleUrban, newUrban)
###Consider making a training and testing dataset
###This can reduce the computational burden
### It also is a robust goodness of fit method
SLsample <- SLsample %>% dplyr::mutate(id = row_number())
#Create training set
train <- SLsample %>% sample_frac(.70)
#Create test set
test <- anti_join(SLsample, train, by = 'id')
Fit a Model
fit <- glm(urbanChg ~ Parks_dist_SL + Rd_dns1km_SL + WaterDist_SL + DEM_SL,data=train,family=binomial())
summary(fit)
##
## Call:
## glm(formula = urbanChg ~ Parks_dist_SL + Rd_dns1km_SL + WaterDist_SL +
## DEM_SL, family = binomial(), data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.296e+00 6.806e-02 63.12 <2e-16 ***
## Parks_dist_SL 1.073e-04 1.609e-06 66.69 <2e-16 ***
## Rd_dns1km_SL 2.095e+01 1.168e-01 179.30 <2e-16 ***
## WaterDist_SL 2.050e-04 8.502e-06 24.11 <2e-16 ***
## DEM_SL -5.207e-03 5.135e-05 -101.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 281124 on 220709 degrees of freedom
## Residual deviance: 115834 on 220705 degrees of freedom
## AIC: 115844
##
## Number of Fisher Scoring iterations: 7
Test the data
## Loading required package: ggplots
library(ROCR)
# plot a ROC curve for a single prediction run
# and color the curve according to cutoff.
pred <- prediction(predict(fit, newdata = test), test$urbanChg)
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)
## A simpler way to understand these result is to calculate the
## area under the curve(AUC). The closer this number is to 1, the
## better your model fit
auc_ROCR <- performance(pred, measure = "auc")
auc_ROCR <- auc_ROCR@y.values[[1]]
auc_ROCR
## [1] 0.9567307
predicted = predict(allrasters, fit)
##
|---------|---------|---------|---------|
=========================================
plot(predicted)
Analysing the Fitting Goodness
The goodness of the fitting result can be assessed by two aspects:
- As shown by the
summary(fit)
result, all the explanatory variables are significant for the explaining the changed areas.- The ROC line is steep at the beginning and AUC is close to 1. Thus the model is good in fitting the result.
What characteristics contribute to urban development in Salt Lake?
According to the coefficients, distance to road has the largest value, which implies that distance to roads (which means transportation convenience) is the most significant (positive) factor that may lead to development. Distance to parks and distance to water have much smaller coefficients. The coefficent is negative, which can be intepret as “higher lands may require more effort to develop”.
What variable might you add to better explain urban development
- Normalized Difference Vegetation Index (NDVI): This index, derived from satellite imagery, measures vegetation health and coverage. It can be used to monitor green spaces within urban areas.
- Air Quality Indices: Using satellite data, it’s possible to estimate the concentration of different pollutants, providing insights into environmental aspects of urban areas.
- Nighttime Lights: Satellite imagery of nighttime lights is often used as a proxy for urbanization and economic activity.
- Land Surface Temperature: Using thermal imagery, land surface temperature can be a good indicator of urban heat islands and overall urbanization levels.
- …
Task:
Analyze the database of amenity landscape characteristics using a clustering technique. Choose variables based on a thematic inquiry of your choice. Visualize ad provide qualitative descriptions of the groups that you derive from the clustering techniques. Provide evidence of you characterizations. Provide a map of your cluster analysis. Give an explanation of the spatial distribution of these groups. (1-2 paragraphs)
library(fastcluster)
##
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
##
## hclust
library(graphics)
library(ggplot2)
library(ggdendro)
library(sf)
## Linking to GEOS 3.11.2, GDAL 3.7.2, PROJ 9.3.0; sf_use_s2() is TRUE
amenData<- st_read("Lab5prj/AmenDataAll.shp")
## Reading layer `AmenDataAll' from data source
## `D:\UMich\Fall23\EAS648\Repo\Lab\05\Lab5prj\AmenDataAll.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 79971 features and 138 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -2356114 ymin: 272043.6 xmax: 2263886 ymax: 3182044
## Projected CRS: NAD83 / Conus Albers
# head(amenData)
geomData = st_sfc(amenData$geometry)
Here I continue to use the set of variables from the tutorial to show amenity: - Distance to zoo
ZooDistEmp
- Distance to hotelHotelDistE
- Distance to an unknown featureHistMon_Di
- Distance to higher educationHigherEdDi
- Distance to golf siteGolfDistEm
- Distance to coastdistcoast
# log transform the distance to show importance of nearness
amenData$ZooEmpDist_log <- log(amenData$ZooDistEmp + 0.00000001)
amenData$HotelEmpDist_log <- log(amenData$HotelDistE + 0.00000001)
amenData$HistMonDist_log <- log(amenData$HistMon_Di + 0.00000001)
amenData$HighEdEmpDist_log <- log(amenData$HigherEdDi + 0.00000001)
amenData$GolfEmpDist_log <- log(amenData$GolfDistEm + 0.00000001)
amenData$distcoast_log <- log(amenData$distcoast + 0.00000001)
# amenData$SocialNorm <- amenData$Nat_Flickr/(amenData$serPop10 + 1)
# amenData$HousingChg <- amenData$Urb2011 - amenData$Urb2001
amenDataDF<-amenData[,c("distcoast_log", "ZooEmpDist_log", "HotelEmpDist_log","HistMonDist_log", "HighEdEmpDist_log", "GolfEmpDist_log")]
## make sure there are no missing data
amenDataDF <- na.omit(amenDataDF)
## we need to make it into a normal dataframe to
## do the analysis
amenDataDF <- as.data.frame(amenDataDF)[1:6]
## calculate z-scores for the dataset
db_scaled <- scale(amenDataDF)
Determine the number of clusters
# Determine number of clusters
wss <- (nrow(db_scaled)-1)*sum(apply(db_scaled,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(db_scaled,
centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")
> Based on the graph, 3 cluster would be enough to classify the points
# K-Means Cluster Analysis
fit_new <- kmeans(db_scaled, 3) # 3 cluster solution
# get cluster means
cluster_means = aggregate(db_scaled,by=list(fit_new$cluster),FUN=mean)
cluster_means$Group.1 = as.factor(cluster_means$Group.1)
cluster_means
## Group.1 distcoast_log ZooEmpDist_log HotelEmpDist_log HistMonDist_log
## 1 1 0.1910249 0.1768522 0.1876527 0.1883689
## 2 2 -4.9999306 -4.8483172 -4.9382400 -4.9810208
## 3 3 0.1775885 0.1942809 0.1780721 0.1820108
## HighEdEmpDist_log GolfEmpDist_log
## 1 -0.002839913 0.1887934
## 2 -4.737587671 -4.9886419
## 3 0.652592318 0.1819301
Plot the Data
library(ggspatial)
library(grid)
##
## Attaching package: 'grid'
## The following object is masked from 'package:terra':
##
## depth
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
# append cluster assignment
amenData <- data.frame(amenData, fit_new$cluster)
st_geometry(amenData) <- geomData
# Create grobs annotation
label_text <- "Author: Haolin Li\nGCS: NAD 1983\nPCS: NAD83 / Conus Albers\nDate: 11/26/2023"
text_grob <- textGrob(label = label_text,
hjust = 1, vjust = 0,
gp = gpar(fontsize = 4))
text_grob <- editGrob(text_grob,x = unit(0.99, "npc"), y = unit(0.08, "npc"))
ggplot() +
geom_sf(data = amenData, mapping = aes(fill = as.factor(fit_new.cluster)), color = NA) +
ggtitle("Clusters based on Kmeans - Number of Clusters: 3") +
labs(fill = "Clusters:") +
# adding north arrow
annotation_north_arrow(location = "tr", which_north = "true",
style = north_arrow_fancy_orienteering()) +
# adding scale bar
annotation_scale(location = "br", width_hint = 0.5) +
# add annotion
annotation_custom(grob = text_grob) +
theme_bw()
Plot the line graph to indicate the meaning of the 3 clusters:
df_long <- gather(cluster_means, key = "Variable", value = "Value", -Group.1)
ggplot(df_long, aes(x = Variable, y = Value, group = Group.1, color = Group.1)) +
geom_line() +
geom_point() +
theme_minimal() +
labs(title = "Line Plot of Means for 3 Clusters", x = "Variables", y = "Mean")
As shown by the line graph:
- Cluster 1 have high negative mean value for every variable except for
HighEdDist_log
which indicates that this cluster may imply the areas that are close to higher education institudes- Cluster 2 and 3 does not show significant preferences for a certain variable and is hard to tell the meaning
These characteristics are also shown on the map
Cluster 1 mainly concentrates in the urban areas, for example in Michigan, it mainly concentrates in the Greater Detroit Area (Detroit, Ann Arbor and Ypsilanti, etc). Cluster 1 also concentrates around the Tri-state area and around LA, where higher education institudes are common, further supports the assumption.
The End of Lab 05