Code
require(tidyverse)
require(kableExtra)
require(totalcensus)
require(tidycensus)
require(hrbrthemes)
require(janitor)
require(tigris)
require(sf)
require(urbnmapr)
options(scipen = 999)The statistical software environment R was used to create this analysis. It is an open-source software for statistical computing and graphics.
The following R packages are required to produce the results generated in the blog post:
tidyversekableExtratotalcensustidycensushrbrthemesjanitortigrissfurbnmaprTo install packages, use the install.packages("mypackage") command in the R console, where "mypackage" is the intended package. Use the following code to load the required packages.
require(tidyverse)
require(kableExtra)
require(totalcensus)
require(tidycensus)
require(hrbrthemes)
require(janitor)
require(tigris)
require(sf)
require(urbnmapr)
options(scipen = 999)This blog post uses the following data sources:
FHFA Uniform Appraisal Dataset (UAD) Appraisal-Level Public Use File (PUF) Version 1.1
uad_puf_v1_1.rdsFHFA Duty to Serve Rural Areas data
rural2021.txt2010 Decennial Census - Housing Units
H001001 (Automatically extracted in the code using an API query)2019_Gaz_tracts_national.txtSimply gather the required data above and place the files in same directory as the code. The following code reads in the data and merges the files together based on census tract.
# load in the UAD Appraisal-Level Public Use File
uad_puf_v1_1 <- readRDS("uad_puf_v1_1.rds") %>% filter(year %in% (2013:2021))
# read in 2021 version of the rural areas file with 2010 tract boundaries
rural_lookup <- read.table(file = "rural2021.txt",
header = TRUE,
colClasses = c(
"character",
"character",
"character",
"character",
"character",
"character",
"character",
"character")) %>%
unite("tract_2010",
STATE:COUNTY:TRACT,
sep = "",
remove = FALSE) %>%
arrange(STATE, COUNTY, TRACT)
# deduplicate St. Louis MSA and select only desired columns
rural_lookup <- rural_lookup %>%
add_count(tract_2010) %>%
#filter(n > 1) %>%
arrange(desc(MSA2018)) %>%
distinct(tract_2010, .keep_all = TRUE) %>%
select(tract_2010, RURAL)
# gather tract level census housing units data
housing_units <- tigris::fips_codes %>%
filter(state_code < 60) %>%
pull(state_code) %>%
unique() %>%
map_dfr( ~ get_decennial(state = .,
"tract",
variables = "H001001",
year = 2010)) %>%
rename(tract_2010 = GEOID,
hous_units = value) %>%
select(tract_2010, hous_units)
# read in 2019 GAZ file
gaz2019 <- read.table(file = "2019_Gaz_tracts_national.txt",
colClasses = c("character",
"character",
"numeric",
"numeric",
"numeric",
"numeric",
"numeric",
"numeric"),
header = TRUE) %>%
select(GEOID, ALAND_SQMI) %>%
rename(tract_2010 = GEOID)
# merge the above sources to the UAD Appraisal-Level Public Use File
uad_puf_mrg <- uad_puf_v1_1 %>%
left_join(rural_lookup, by = ("tract_2010")) %>%
left_join(housing_units, by = ("tract_2010")) %>%
left_join(gaz2019, by = "tract_2010")The following categories are filtered out of the subset used to create the charts in the blog post. Note that some might be overlapping.
Overall, 1,308,850 records remain, which is about 95% of the total records in the UAD Public Use File.
# recode variables
uad_puf_mrg <- uad_puf_mrg %>%
# compute housing density measurement and categorize appraisals by location
mutate(husqmi = hous_units / ALAND_SQMI,
location = case_when(
RURAL == "1" ~ "Rural",
RURAL == "0" & husqmi >= 16 & husqmi <= 1600 ~ "Urban (low-density)",
RURAL == "0" & husqmi > 1600 ~ "Urban (high-density)",
.default = NA),
# create a label for the number of comps variable
num_comps_label = recode(number_comparables,
"1" = "3",
"2" = "4",
"3" = "5",
"4" = "6",
"5" = "7 +",
"9" = "Missing"),
# create a 5+ comps label
five_plus_comps_label = recode(number_comparables,
"1" = "3 - 4",
"2" = "3 - 4",
"3" = "5 +",
"4" = "5 +",
"5" = "5 +",
"9" = "Missing"),
# create a numeric binary variable for 5+ comps
five_plus_comps = dplyr::recode(number_comparables,
"1" = 0,
"2" = 0,
"3" = 1,
"4" = 1,
"5" = 1))
# if there are NA values in location, replace them with Missing
uad_puf_mrg$location <- uad_puf_mrg$location %>% replace_na("Missing")
# create the subset for charting and analysis
blog_subset <- uad_puf_mrg %>%
# non missing num_comp categories
filter(number_comparables %in% c('1', '2', '3', '4', '5')) %>%
# only want purchase and refi
filter(purpose == '1' | purpose == '2') %>%
# filter out missing values for location
filter(location != "Missing") %>%
# filter out Puerto Rico
filter(state_fips != '72')
# cast location as a factor and assign levels
blog_subset$location <- factor(blog_subset$location,
levels = c("Urban (high-density)",
"Urban (low-density)",
"Rural"))
# cast state_fips as a factor variable and convert to 2 letter state abbreviation
blog_subset$state <- blog_subset$state_fips %>%
convert_fips_to_names()Figure 1 represents the distribution of the comparable properties used in home appraisals acquired by the Enterprises.
# chart the overall distribution
blog_subset %>%
count(num_comps_label) %>%
mutate(total_num_comps_label = sum(n),
perc = n / total_num_comps_label) %>%
ggplot(aes(x = num_comps_label, y = perc)) +
geom_col(fill = "#0570b0") +
scale_y_continuous(
labels = scales::percent,
breaks = c(0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3),
limits = c(0, 0.3)) +
labs(
title = "Figure 1: Percentage of Appraisals by Number of Comps (2013 – 2021)",
caption = paste(
paste('Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)'),
str_wrap('Note: Data have been restricted to only purchase and refinance appraisals that are not missing the “number of comps” field in the UAD PUF and can be assigned a location category. Additionally, appraisals in Puerto Rico are excluded. In total, 4.7 percent of the UAD PUF records are excluded.',125), sep = '\n'),
x = "Number of Comps",
y = "") +
theme_ipsum(
axis_title_just = "cc",
plot_title_size = 20,
axis_title_size = 16,
caption_size = 11) +
theme(
plot.title = element_text(margin = margin(b = 10)),
plot.subtitle = element_text(margin = margin(b = 16)),
axis.title.y = element_text(margin = margin(r = 16)),
axis.title.x = element_text(margin = margin(t = 16)),
axis.text.x = element_text(size = 12, vjust = 0.5),
axis.text.y = element_text(size = 12, vjust = 0.5),
plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank())Figure 2 represents the trend in the percentage of appraisals with five or more comparable properties over time.
# create a subset of the data for use in the time series chart
time_series_data <- blog_subset %>%
select(year, five_plus_comps)
# convert year to date
time_series_data$year <- lubridate::ymd(time_series_data$year, truncated = 2L)
# compute the percent of 5+ comps by year, chart that percent from 2013 - 2021
time_series_data %>%
count(year, five_plus_comps) %>%
group_by(year) %>%
mutate(total = sum(n, na.rm = TRUE)) %>%
ungroup() %>%
mutate(perc = n / total) %>%
filter(five_plus_comps == 1) %>%
transmute(year = year(year), perc) %>%
ggplot(aes(x=year, y=perc)) +
geom_line(color="#0570b0", size = 1) +
geom_point(color ="#0570b0", size=2.5) +
scale_x_continuous(breaks = 2013:2021) +
scale_y_continuous(labels = scales::percent,
limits = c(0, 0.9),
breaks = seq(0, 0.9, 0.1),
expand = expansion(mult = 0)) +
labs(
title = "Figure 2: Percentage of Appraisals with Five or More Comps (2013 - 2021)",
caption = 'Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)',
x = "Year",
y = "") +
theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16, caption_size = 11) +
theme(plot.title = element_text(margin = margin(b = 30), hjust = 0.5),
plot.subtitle = element_text(margin = margin(b = 16)),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
axis.text = ggplot2::element_text(),
axis.text.x = ggplot2::element_text(size = 12, vjust = 0.5,
margin = ggplot2::margin(t = 4L)),
axis.text.y = ggplot2::element_text(size = 12, hjust = 1),
axis.text.x.top = NULL,
axis.text.y.right = NULL,
axis.ticks.length.x = NULL,
axis.ticks.length.x.top = NULL,
axis.ticks.length.x.bottom = NULL,
axis.ticks.length.y = NULL,
axis.ticks.length.y.left = NULL,
axis.ticks.length.y.right = NULL,
axis.title = ggplot2::element_text(face = "italic"),
axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 16)),
axis.title.y = ggplot2::element_text(angle = 90L,
margin = ggplot2::margin(r = 16)),
axis.title.x.top = NULL,
axis.title.y.right = NULL,
axis.ticks = ggplot2::element_line(),
axis.ticks.length = ggplot2::unit(4L, "pt"),
axis.ticks.x = ggplot2::element_line(colour = NULL,
size = NULL,
linetype = NULL,
lineend = NULL),
axis.ticks.y = ggplot2::element_blank(),
axis.line = ggplot2::element_line(),
axis.line.x = ggplot2::element_line(colour = NULL,
size = NULL,
linetype = NULL,
lineend = NULL),
axis.line.y = ggplot2::element_blank())Figure 3a shows each state’s percentage of appraisals with five or more comparable properties in descending rank order. Figure 3b displays a corresponding map by state.
# chart the percent of appraisals with 5+ comps by state
blog_subset %>%
count(state, five_plus_comps) %>%
group_by(state) %>%
mutate(total = sum(n, na.rm = TRUE)) %>%
ungroup() %>%
mutate(perc = n / total) %>%
filter(five_plus_comps == 1) %>%
arrange(perc) %>%
mutate(state = factor(state, levels = .$state)) %>%
ggplot(aes(perc, state)) +
geom_segment(aes(x = 0, xend = perc, y = state, yend = state)) +
geom_point(color="#0570b0", size=2.5) +
scale_x_continuous(labels = scales::percent, expand = expansion(mult = c(0, 0)), limits = c(0, 1)) +
labs(
title = "Figure 3a: Percentage of Appraisals with Five or More Comps by State (2013 - 2021)",
x = " ",
y = " ",
caption = 'Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)') +
theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16, caption_size = 11) +
theme(plot.title = element_text(margin = margin(b = 10)),
plot.subtitle = element_text(size = 14, margin = margin(b = 16)),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
axis.text = ggplot2::element_text(),
axis.text.x = ggplot2::element_text(size = 12, vjust = 0.5,
margin = ggplot2::margin(t = 4L)),
axis.text.y = ggplot2::element_text(size = 10.5, hjust = 1),
axis.text.x.top = NULL,
axis.text.y.right = NULL,
axis.ticks.length.x = NULL,
axis.ticks.length.x.top = NULL,
axis.ticks.length.x.bottom = NULL,
axis.ticks.length.y = NULL,
axis.ticks.length.y.left = NULL,
axis.ticks.length.y.right = NULL,
axis.title = ggplot2::element_text(face = "italic"),
axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 16)),
axis.title.y = ggplot2::element_text(angle = 90L,
margin = ggplot2::margin(r = 16)),
axis.title.x.top = NULL,
axis.title.y.right = NULL,
axis.ticks = ggplot2::element_line(),
axis.ticks.length = ggplot2::unit(4L, "pt"),
axis.ticks.x = ggplot2::element_line(colour = NULL,
size = NULL,
linetype = NULL,
lineend = NULL),
axis.ticks.y = ggplot2::element_blank(),
axis.line = ggplot2::element_line(),
axis.line.x = ggplot2::element_line(colour = NULL,
size = NULL,
linetype = NULL,
lineend = NULL),
axis.line.y = ggplot2::element_blank())# get state geometry
states <- tigris::states(progress_bar = FALSE) %>% select(STUSPS, geometry)
state_geometry <- shift_geometry(st_transform(states, "ESRI:102003"))
# compute percent of appraisals with 5+ comps by state
state_data <- blog_subset %>%
count(state, five_plus_comps) %>%
group_by(state) %>%
mutate(total = sum(n, na.rm = TRUE)) %>%
ungroup() %>%
mutate(perc = n / total) %>%
filter(five_plus_comps == 1) %>%
arrange(perc) %>%
mutate(state = factor(state, levels = .$state))
# join the percentage on the state geometry
mapdata <- inner_join(state_geometry, state_data, join_by(STUSPS == state))
# create a map of the percent of appraisals with 5+ comps by state
mapdata %>%
ggplot() +
geom_sf(mapping = aes(fill = perc),
color = "white") +
scale_fill_gradient(
low = "#F3F3F3",
high = "#4080bf",
breaks = c(0, 0.5, 1),
labels = c("0%", "50%", "100%"),
limits = c(0, 1)) +
labs(
fill = NULL,
title = "Figure 3b: Percentage of Appraisals with Five or More Comps by State (2013 - 2021)",
caption = 'Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)') +
geom_sf_text(data = get_urbn_labels(map = "states", sf = TRUE),
aes(label = state_abbv),
size = 3) +
theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16, caption_size = 11) +
theme(plot.margin = unit(c(1, 2, 1, 1), "cm"),
legend.position = c(0.97, 0.45),
plot.title = element_text(margin = margin(b = 10)),
plot.subtitle = element_text(size = 14, margin = margin(b = 16)),
legend.text=element_text(size=11),
plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank())Figure 4 below shows how the number of comparable properties varies by high-density urban areas, low-density urban areas, and rural areas.
# create a table of the count of appraisals per location category
blog_subset %>%
count(location) %>%
mutate(total = sum(n, na.rm = TRUE),
Percent = n / total) %>%
mutate(across(n, scales::label_comma())) %>%
mutate(across(Percent, scales::label_percent())) %>%
rename(Appraisals=n) %>%
select(!total) %>%
kbl() %>%
kable_styling(bootstrap_options = c("bordered", "hover", full_width = F))| location | Appraisals | Percent |
|---|---|---|
| Urban (high-density) | 330,582 | 25.3% |
| Urban (low-density) | 752,686 | 57.5% |
| Rural | 225,582 | 17.2% |
# chart the percent of appraisals with 5+ comps by location category
blog_subset %>%
ggplot(aes(x = five_plus_comps_label,
y = after_stat(prop),
fill = location,
group = location)) +
geom_bar(position = position_dodge()) +
scale_fill_manual(values = c("#74a9cf", "#2b8cbe", "#045a8d")) +
scale_y_continuous(labels = scales::percent, breaks = seq(0, 0.8, 0.1), limits = c(0,0.8)) +
labs(
title = "Figure 4: Percentage of Appraisals with Five or More Comps (2013 - 2021)",
caption = paste(
paste('Source: FHFA, UAD Appraisal-Level Public Use File (UAD PUF)'),
str_wrap('Note: Rural tracts assigned based on FHFA Duty to Serve definitions. For non-rural tracts, low-density is defined as having 16-1,600 housing units per square mile. High-density urban areas are non-rural tracts with greater than 1,600 housing units per square mile. Rural: 225,582 appraisals (17.2%); Urban (low-density): 752,686 appraisals (57.5%); Urban (high-density): 330,582 appraisals (25.3%).',125), sep = '\n'),
x = "Number of Comps",
y = "",
fill = "") +
theme_ipsum(axis_title_just = "cc", plot_title_size = 20, axis_title_size = 16, caption_size = 11) +
theme(plot.title = element_text(margin = margin(b = 10)),
plot.subtitle = element_text(size = 14, margin = margin(b = 16)),
axis.title.y = element_text(margin = margin(r = 16)),
axis.title.x = element_text(margin = margin(t = 16)),
axis.text.x = element_text(size = 12, vjust = 0.5),
axis.text.y = element_text(size = 12, vjust = 0.5),
legend.text=element_text(size=11),
plot.caption = element_text(size = 12, margin = margin(t = 22), hjust = 0),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank())