# calculations
library(geomorph) # GPA
library(Morpho) # CVA
library(reshape2) # melt
library(IdentiFlyR) # classification
library(stringr) #str_split_fixed
# plotting and visualization
library(ggplot2) # plots
::theme_set(theme_light())
ggplot2library(rnaturalearth) # maps
library(raster) # raster
library(sf) # st_bbox
library(ggpubr) # ggarange
Analysis of honey bee wing shape in southwestern Asia
Libraries
Variable names
<- 19 # number of landmarks
p <- 2 # number of dimensions, in this case 2 for coordinates (x, y)
k
# create coordinates names used by IdentiFly
<- c("x1", "y1")
xyNames for (i in 2:p) {
<- c(xyNames, paste0("x", i))
xyNames <- c(xyNames, paste0("y", i))
xyNames
} xyNames
[1] "x1" "y1" "x2" "y2" "x3" "y3" "x4" "y4" "x5" "y5" "x6" "y6"
[13] "x7" "y7" "x8" "y8" "x9" "y9" "x10" "y10" "x11" "y11" "x12" "y12"
[25] "x13" "y13" "x14" "y14" "x15" "y15" "x16" "y16" "x17" "y17" "x18" "y18"
[37] "x19" "y19"
# The number of principal components used is 2*p-4 = 34, which is equal to the
# degrees of freedom create principal components names used by prcomp
<- paste0("PC", 1:(2 * p - 4))
pcNames pcNames
[1] "PC1" "PC2" "PC3" "PC4" "PC5" "PC6" "PC7" "PC8" "PC9" "PC10"
[11] "PC11" "PC12" "PC13" "PC14" "PC15" "PC16" "PC17" "PC18" "PC19" "PC20"
[21] "PC21" "PC22" "PC23" "PC24" "PC25" "PC26" "PC27" "PC28" "PC29" "PC30"
[31] "PC31" "PC32" "PC33" "PC34"
# define which landmarks are connected by lines in wireframe graph
<- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 14,
link.x 15, 16, 17)
<- c(2, 3, 4, 5, 6, 19, 6, 10, 12, 8, 8, 14, 19, 9, 10, 15, 11, 12, 16, 13,
link.y 18, 15, 16, 17, 18)
<- cbind(link.x, link.y) links.apis
Read data
Read raw coordinates of 19 landmarks from Zenodo (Machlowska et al., 2025) Data for Kazakhstan refer to previous publication Temirbayeva et al., (2023a) and Temirbayeva et al., (2023b)
<- read.csv("https://zenodo.org/records/17075125/files/AZ-raw-coordinates.csv")
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/CY-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/GE-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/IQ-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/IR-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/SA-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/TJ-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/TR-raw-coordinates.csv"))
wings
<- read.csv("https://zenodo.org/records/17075125/files/AZ-data.csv")
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/CY-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/GE-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/IQ-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/IR-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/SA-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/TJ-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/TR-data.csv"))
geo.data
<- read.csv("https://zenodo.org/record/8128010/files/KZ-data.csv", strip.white = TRUE)
KZ.geo.data colnames(KZ.geo.data)[colnames(KZ.geo.data) == "group"] <- "notes"
<- read.csv("https://zenodo.org/record/8128010/files/KZ-raw-coordinates.csv")
KZ.wings # use only samples declared as 'local bees'
<- subset(KZ.wings, KZ.geo.data$notes == "local bee")
KZ.wings <- rbind(wings, KZ.wings)
wings <- subset(KZ.geo.data, notes == "local bee")
KZ.geo.data <- rbind(geo.data, KZ.geo.data)
geo.data
# extract classifiers
$sample <- substr(wings$file, 1, 7)
wings$country <- substr(wings$file, 1, 2)
wings$country <- gsub("AZ", "Azerbaijan", wings$country)
wings$country <- gsub("CY", "Cyprus", wings$country)
wings$country <- gsub("GE", "Georgia", wings$country)
wings$country <- gsub("IQ", "Iraq", wings$country)
wings$country <- gsub("IR", "Iran", wings$country)
wings$country <- gsub("KZ", "Kazakhstan", wings$country)
wings$country <- gsub("SA", "Saudi Arabia", wings$country)
wings$country <- gsub("TJ", "Tajikistan", wings$country)
wings$country <- gsub("TR", "Turkey", wings$country)
wings
# Aggregate geographic data by sample
<- aggregate(geo.data[c("latitude", "longitude")], by = list(wings$sample),
geoDataSample FUN = mean)
<- data.frame(geoDataSample, row.names = 1) # move column 1 to row names
geoDataSample $country <- substr(rownames(geoDataSample), 1, 2)
geoDataSample$country <- gsub("AZ", "Azerbaijan", geoDataSample$country)
geoDataSample$country <- gsub("CY", "Cyprus", geoDataSample$country)
geoDataSample$country <- gsub("GE", "Georgia", geoDataSample$country)
geoDataSample$country <- gsub("IQ", "Iraq", geoDataSample$country)
geoDataSample$country <- gsub("IR", "Iran", geoDataSample$country)
geoDataSample$country <- gsub("KZ", "Kazakhstan", geoDataSample$country)
geoDataSample$country <- gsub("SA", "Saudi Arabia", geoDataSample$country)
geoDataSample$country <- gsub("TJ", "Tajikistan", geoDataSample$country)
geoDataSample$country <- gsub("TR", "Turkey", geoDataSample$country)
geoDataSample
nrow(wings) # number of wings
[1] 17457
table(wings$country) # number of wings per country
Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan
114 654 3579 3399 91 442
Saudi Arabia Tajikistan Turkey
1397 88 7693
nrow(geoDataSample) # number of samples
[1] 1557
table(geoDataSample$country) # number of samples per country
Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan
14 73 102 401 11 29
Saudi Arabia Tajikistan Turkey
127 5 795
# number of workers per sample
<- table(wings$sample)
nColony <- data.frame(nColony, row.names = 1) # move column 1 to row names
nColony ggplot(nColony, aes(x = Freq)) + geom_histogram() + coord_cartesian(xlim = c(0, 40))
summary(nColony$Freq)
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.0 8.0 9.0 11.2 10.0 40.0
# number of samples per location
<- paste(round(geoDataSample$latitude, 1), round(geoDataSample$longitude,
location 1))
<- table(location)
nLocation <- as.data.frame(nLocation)
nLocation
<- str_split_fixed(nLocation$location, " ", 2)
locationLatLon colnames(locationLatLon) <- c("latitude", "longitude")
<- apply(locationLatLon, 2, as.numeric)
locationLatLon <- as.data.frame(locationLatLon)
locationLatLon <- cbind(nLocation, locationLatLon) nLocation
Maps
# Read elevation data. In order to download the TIF file uncomment the two
# lines below.
# download.file('https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip',
# 'wc2.1_2.5m_elev.zip'); unzip('wc2.1_2.5m_elev.zip'); Or use your local file.
<- raster("D:/WorldClim/wc2.1_2.5m_elev.tif")
elev
<- colorRampPalette(c("#f7f7f7", "#d9d9d9", "#bdbdbd", "#636363", "#303030"),
elev.color bias = 2)
<- 25
x.min <- 83
x.max <- 15
y.min <- 46
y.max <- extent(x.min - 5, x.max + 5, y.min - 5, y.max + 5)
e <- crop(elev, e)
elev <- data.frame(rasterToPoints(elev))
elev.df colnames(elev.df) <- c("longitude", "latitude", "altitude")
<- ne_countries(scale = "medium", returnclass = "sf")
world # Extract Caspian See outline
<- ne_coastline(scale = "medium", returnclass = "sf")
coastline <- st_bbox(c(xmin = 46, xmax = 56, ymin = 36, ymax = 47))
bbox <- st_crop(coastline, bbox)
caspian $geometry <- sf::st_cast(caspian$geometry, "POLYGON") # convert to polygon
caspian
ggplot(data = world) + geom_raster(data = elev.df, aes(longitude, latitude, fill = altitude)) +
geom_sf(data = caspian, fill = "white") + scale_fill_gradientn(colours = elev.color(100)) +
geom_sf(fill = NA) + coord_sf(xlim = c(x.min, x.max), ylim = c(y.min, y.max)) +
geom_jitter(data = geoDataSample, aes(x = longitude, y = latitude, color = country,
shape = country), size = 0.7, width = 0.3, height = 0.3) + scale_color_manual(name = "country",
values = rainbow(9)) + scale_shape_manual(name = "country", values = 1:9) + theme(legend.position = "right")
ggplot(data = world) + geom_raster(data = elev.df, aes(longitude, latitude, fill = altitude)) +
geom_sf(data = caspian, fill = "white") + scale_fill_gradientn(colours = elev.color(100)) +
geom_sf(fill = NA) + coord_sf(xlim = c(x.min, x.max), ylim = c(y.min, y.max)) +
geom_point(data = nLocation, aes(x = longitude, y = latitude, size = Freq)) +
scale_size_binned_area(name = "colonies", limits = c(0, 100), breaks = c(0, 5,
10, 20, 40)) + theme(legend.position = "right")
GPA-alignment
# Convert from 2D array to 3D array
<- arrayspecs(wings[xyNames], p, k)
wings3D dimnames(wings3D)[[3]] <- wings$file
# Align the coordinates using Generalized Procrustes Analysis
<- gpagen(wings3D, print.progress = FALSE)
GPA
# plot landmarks after alignment
plotAllSpecimens(GPA$coords, links = links.apis, label = TRUE, plot_param = list(pt.bg = "black",
pt.cex = 0.5, mean.bg = "red", mean.cex = 1, link.col = "red", txt.pos = 3, txt.cex = 1))
# Convert from 3D array to 2D array
<- two.d.array(GPA$coords)
wingsAligned colnames(wingsAligned) <- xyNames
<- aggregate(wingsAligned, by = list(wings$sample), FUN = mean)
samplesAligned <- data.frame(samplesAligned, row.names = 1) # move column 1 to row names samplesAligned
PCA colony
<- prcomp(samplesAligned[, xyNames])
PCA summary(PCA)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 0.0103 0.00784 0.00686 0.00503 0.00414 0.00376 0.0035
Proportion of Variance 0.2851 0.16607 0.12706 0.06836 0.04620 0.03810 0.0331
Cumulative Proportion 0.2851 0.45117 0.57823 0.64659 0.69278 0.73089 0.7640
PC8 PC9 PC10 PC11 PC12 PC13 PC14
Standard deviation 0.00347 0.00302 0.00286 0.00273 0.0027 0.00228 0.00219
Proportion of Variance 0.03247 0.02457 0.02212 0.02008 0.0197 0.01401 0.01293
Cumulative Proportion 0.79646 0.82103 0.84315 0.86324 0.8829 0.89689 0.90982
PC15 PC16 PC17 PC18 PC19 PC20 PC21
Standard deviation 0.00209 0.00201 0.00198 0.00173 0.00159 0.00142 0.00138
Proportion of Variance 0.01180 0.01087 0.01056 0.00811 0.00686 0.00543 0.00511
Cumulative Proportion 0.92162 0.93249 0.94306 0.95116 0.95802 0.96345 0.96856
PC22 PC23 PC24 PC25 PC26 PC27
Standard deviation 0.00134 0.00121 0.00111 0.00109 0.00105 0.000918
Proportion of Variance 0.00484 0.00393 0.00333 0.00318 0.00300 0.002280
Cumulative Proportion 0.97341 0.97733 0.98066 0.98384 0.98684 0.989120
PC28 PC29 PC30 PC31 PC32 PC33
Standard deviation 0.000906 0.000859 0.000835 0.000795 0.000682 0.000605
Proportion of Variance 0.002220 0.001990 0.001880 0.001710 0.001260 0.000990
Cumulative Proportion 0.991330 0.993320 0.995210 0.996920 0.998170 0.999160
PC34 PC35 PC36 PC37 PC38
Standard deviation 0.000558 4.43e-17 2.53e-17 1.88e-17 1.08e-17
Proportion of Variance 0.000840 0.00e+00 0.00e+00 0.00e+00 0.00e+00
Cumulative Proportion 1.000000 1.00e+00 1.00e+00 1.00e+00 1.00e+00
<- as.data.frame(PCA$x)
PCscores <- cbind(PCscores, geoDataSample)
PCscores
# create plot labels
<- summary(PCA)$importance
variance.tab <- variance.tab["Proportion of Variance", "PC1"]
variance <- round(100 * variance, 1)
variance <- paste0("PC1 (", variance, "%)")
label.x <- variance.tab["Proportion of Variance", "PC2"]
variance <- round(100 * variance, 1)
variance <- paste0("PC2 (", variance, "%)")
label.y
<- ggplot(PCscores, aes(x = PC1, y = PC2, shape = country, color = country)) +
figPca geom_point(size = 0.7) + scale_shape_manual(name = "country", values = 1:9) +
scale_color_manual(name = "country", values = rainbow(9)) + stat_ellipse() +
xlab(label.x) + ylab(label.y)
figPca
MANOVA
<- manova(as.matrix(cbind(PCscores[, pcNames])) ~ country, PCscores)
MANOVA summary(MANOVA)
Df Pillai approx F num Df den Df Pr(>F)
country 8 3.29 31.3 272 12176 <2e-16 ***
Residuals 1548
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
LDA
# use equal prior probability for all groups
<- length(unique(PCscores$country)) # number of groups
n.gr <- CVA(PCscores[pcNames], PCscores$country, rounds = 10000, cv = TRUE, prior = rep(1/n.gr,
LDA
n.gr))<- as.data.frame(LDA$CVscores)
LDscores # rename variable names from CV to LD otherwise use `CV 1`
colnames(LDscores) <- gsub("CV ", "LD", colnames(LDscores))
$country <- geoDataSample$country
LDscores
<- ggplot(LDscores, aes(x = LD1, y = LD2, shape = country, color = country)) +
figLda geom_point(size = 0.7) + scale_shape_manual(name = "country", values = 1:9) +
scale_color_manual(name = "country", values = rainbow(9)) + stat_ellipse()
figLda
ggarrange(figPca, figLda, labels = c("a", "b"), font.label = list(size = 12, face = "bold"),
ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom")
# Classification of the samples to groups
<- typprobClass(LDA$CVscores, groups = as.factor(LDscores$country), outlier = 0)
LDclass print(LDclass)
cross-validated classification results in frequencies
Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan Saudi Arabia
Azerbaijan 14 0 0 0 0 0 0
Cyprus 1 68 0 0 2 0 0
Georgia 0 0 102 0 0 0 0
Iran 0 0 0 398 0 2 0
Iraq 0 0 0 0 11 0 0
Kazakhstan 0 0 1 0 0 26 0
Saudi Arabia 0 0 0 0 0 0 127
Tajikistan 0 0 0 0 0 0 0
Turkey 5 8 8 2 5 12 0
Tajikistan Turkey
Azerbaijan 0 0
Cyprus 0 2
Georgia 0 0
Iran 1 0
Iraq 0 0
Kazakhstan 2 0
Saudi Arabia 0 0
Tajikistan 5 0
Turkey 1 754
cross-validated classification result in %
Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan
Azerbaijan 100.00000 0.00000 0.00000 0.00000 0.00000 0.00000
Cyprus 1.36986 93.15068 0.00000 0.00000 2.73973 0.00000
Georgia 0.00000 0.00000 100.00000 0.00000 0.00000 0.00000
Iran 0.00000 0.00000 0.00000 99.25187 0.00000 0.49875
Iraq 0.00000 0.00000 0.00000 0.00000 100.00000 0.00000
Kazakhstan 0.00000 0.00000 3.44828 0.00000 0.00000 89.65517
Saudi Arabia 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
Tajikistan 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
Turkey 0.62893 1.00629 1.00629 0.25157 0.62893 1.50943
Saudi Arabia Tajikistan Turkey
Azerbaijan 0.00000 0.00000 0.00000
Cyprus 0.00000 0.00000 2.73973
Georgia 0.00000 0.00000 0.00000
Iran 0.00000 0.24938 0.00000
Iraq 0.00000 0.00000 0.00000
Kazakhstan 0.00000 6.89655 0.00000
Saudi Arabia 100.00000 0.00000 0.00000
Tajikistan 0.00000 100.00000 0.00000
Turkey 0.00000 0.12579 94.84277
overall classification accuracy: 96.66024 %
Kappa statistic: 0.95027
# Mahalanobis distances between groups
::kable(as.data.frame(as.matrix(LDA$Dist$GroupdistMaha)), digits = 3) knitr
Azerbaijan | Cyprus | Georgia | Iran | Iraq | Kazakhstan | Saudi Arabia | Tajikistan | Turkey | |
---|---|---|---|---|---|---|---|---|---|
Azerbaijan | 0.00 | 5.16 | 7.71 | 6.67 | 4.62 | 7.76 | 10.60 | 7.38 | 4.93 |
Cyprus | 5.16 | 0.00 | 7.62 | 6.53 | 5.74 | 7.57 | 9.94 | 7.89 | 4.91 |
Georgia | 7.71 | 7.62 | 0.00 | 6.49 | 7.30 | 5.64 | 9.66 | 5.37 | 5.16 |
Iran | 6.67 | 6.53 | 6.49 | 0.00 | 6.48 | 5.28 | 9.38 | 5.12 | 6.24 |
Iraq | 4.62 | 5.74 | 7.30 | 6.48 | 0.00 | 6.92 | 11.34 | 7.06 | 5.31 |
Kazakhstan | 7.76 | 7.57 | 5.64 | 5.28 | 6.92 | 0.00 | 9.86 | 4.03 | 5.65 |
Saudi Arabia | 10.60 | 9.94 | 9.66 | 9.38 | 11.34 | 9.86 | 0.00 | 9.66 | 8.91 |
Tajikistan | 7.38 | 7.89 | 5.37 | 5.12 | 7.06 | 4.03 | 9.66 | 0.00 | 6.40 |
Turkey | 4.93 | 4.91 | 5.16 | 6.24 | 5.31 | 5.65 | 8.91 | 6.40 | 0.00 |
# Probabilities of differences between groups
::kable(as.data.frame(as.matrix(LDA$Dist$probsMaha)), digits = 5) knitr
Azerbaijan | Cyprus | Georgia | Iran | Iraq | Kazakhstan | Saudi Arabia | Tajikistan | Turkey | |
---|---|---|---|---|---|---|---|---|---|
Azerbaijan | 0.0000 | 1e-04 | 0.0001 | 0.0001 | 0.0015 | 0.0001 | 1e-04 | 0.0002 | 1e-04 |
Cyprus | 0.0001 | 0e+00 | 0.0001 | 0.0001 | 0.0001 | 0.0001 | 1e-04 | 0.0001 | 1e-04 |
Georgia | 0.0001 | 1e-04 | 0.0000 | 0.0001 | 0.0001 | 0.0001 | 1e-04 | 0.0031 | 1e-04 |
Iran | 0.0001 | 1e-04 | 0.0001 | 0.0000 | 0.0001 | 0.0001 | 1e-04 | 0.0044 | 1e-04 |
Iraq | 0.0015 | 1e-04 | 0.0001 | 0.0001 | 0.0000 | 0.0001 | 1e-04 | 0.0002 | 1e-04 |
Kazakhstan | 0.0001 | 1e-04 | 0.0001 | 0.0001 | 0.0001 | 0.0000 | 1e-04 | 0.1117 | 1e-04 |
Saudi Arabia | 0.0001 | 1e-04 | 0.0001 | 0.0001 | 0.0001 | 0.0001 | 0e+00 | 0.0001 | 1e-04 |
Tajikistan | 0.0002 | 1e-04 | 0.0031 | 0.0044 | 0.0002 | 0.1117 | 1e-04 | 0.0000 | 5e-04 |
Turkey | 0.0001 | 1e-04 | 0.0001 | 0.0001 | 0.0001 | 0.0001 | 1e-04 | 0.0005 | 0e+00 |
Classification of colonies as lineages
Comparison with lineages dataset Nawrocka et al., (2018a) and Nawrocka et al., (2018b)
<- xml2gmLdaData("https://zenodo.org/records/14054009/files/apis-mellifera-lineage.dw.xml")
idData <- gmLdaData2id(idData, samplesAligned[, xyNames], average = FALSE)
id $plot id
<- table(id$id$group)
tab tab
A C M O
383 260 85 829
prop.table(tab)
A C M O
0.2460 0.1670 0.0546 0.5324
<- id$id
idID $country <- substr(rownames(idID), 1, 2)
idID<- table(idID$group, idID$country)
tab tab
AZ CY GE IQ IR KZ SA TJ TR
A 0 2 7 0 146 6 127 0 95
C 2 43 0 1 107 11 0 3 93
M 0 8 0 0 52 0 0 0 25
O 12 20 95 10 96 12 0 2 582
prop.table(tab, margin = 2)
AZ CY GE IQ IR KZ SA TJ TR
A 0.0000 0.0274 0.0686 0.0000 0.3641 0.2069 1.0000 0.0000 0.1195
C 0.1429 0.5890 0.0000 0.0909 0.2668 0.3793 0.0000 0.6000 0.1170
M 0.0000 0.1096 0.0000 0.0000 0.1297 0.0000 0.0000 0.0000 0.0314
O 0.8571 0.2740 0.9314 0.9091 0.2394 0.4138 0.0000 0.4000 0.7321
<- id$MD2
idMD <- as.data.frame(id$P)
pTab <- rowSums(pTab)
sums <- pTab/sums
pTab
<- pTab
pTabLong $sample <- rownames(pTabLong)
pTabLong<- melt(pTabLong, id.vars = c("sample"))
pTabLong $gr <- substr(pTabLong$sample, 1, 2) # short 2 letter name
pTabLong
ggplot(pTabLong, aes(sample, value, fill = variable)) + geom_col(width = 1) + scale_fill_manual(values = rainbow(4),
name = "") + facet_grid(~gr, switch = "x", scales = "free", space = "free") +
theme_minimal() + labs(x = NULL, y = NULL) + scale_y_continuous(expand = c(0,
0)) + guides(fill = guide_legend(nrow = 1, byrow = TRUE)) + theme(panel.spacing.x = unit(0.2,
"lines"), axis.text.x = element_blank(), legend.position = "top")
$lineage <- idID$group
geoDataSample
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample, aes(x = longitude,
y = latitude, color = lineage, shape = lineage), width = 0.4, height = 0.4, size = 0.7) +
coord_sf(xlim = c(x.min, x.max), ylim = c(y.min, y.max)) + scale_color_manual(name = "",
values = rainbow(4)) + scale_shape_manual(name = "", values = 1:4) + guides(color = guide_legend(nrow = 1,
byrow = TRUE)) + theme(legend.position = "top")
# map of lineage A
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
"A", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
ylim = c(y.min, y.max)) x.max),
# map of lineage C
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
"C", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
ylim = c(y.min, y.max)) x.max),
# map of lineage M
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
"M", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
ylim = c(y.min, y.max)) x.max),
# map of lineage O
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
"O", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
ylim = c(y.min, y.max)) x.max),
Error of measurements
<- read.csv("https://zenodo.org/records/17075125/files/GE-30-3x.csv", row.names = 1)
wingsAdj <- arrayspecs(wingsAdj[xyNames], p, k)
wingsAdj3D dimnames(wingsAdj3D)[[3]] <- rownames(wingsAdj)
# Align the coordinates using Generalized Procrustes Analysis
<- gpagen(wingsAdj3D, print.progress = FALSE)
GPA <- two.d.array(GPA$coords)
wingsAdjAligned colnames(wingsAdjAligned) <- xyNames
<- as.data.frame(wingsAdjAligned)
wingsAdjAligned $colony <- substr(rownames(wingsAdj), 5, 7)
wingsAdjAligned$colony <- as.numeric(wingsAdjAligned$colony)
wingsAdjAligned$replication <- str_sub(rownames(wingsAdj), -8, -8)
wingsAdjAligned$colony_rep <- paste(wingsAdjAligned$colony, wingsAdjAligned$replication,
wingsAdjAlignedsep = "_")
<- aggregate(wingsAdjAligned[xyNames], by = list(wingsAdjAligned$colony_rep),
samplesAdj FUN = mean)
<- data.frame(samplesAdj, row.names = 1) # move column 1 to row names
samplesAdj
<- str_split_fixed(rownames(samplesAdj), "_", 2)
colonyRep colnames(colonyRep) <- c("colony", "replication")
<- cbind(samplesAdj, colonyRep)
samplesAdj table(samplesAdj$replication)
0 1 2
30 30 30
<- prcomp(samplesAdj[, xyNames])
PCA <- as.data.frame(PCA$x)
PCscores
$colony <- samplesAdj$colony
PCscores$group <- samplesAdj$replication
PCscores$point <- PCscores$group
PCscores
# create plot labels
<- summary(PCA)$importance
variance.tab <- variance.tab["Proportion of Variance", "PC1"]
variance <- round(100 * variance, 1)
variance <- paste0("PC1 (", variance, "%)")
label.x <- variance.tab["Proportion of Variance", "PC2"]
variance <- round(100 * variance, 1)
variance <- paste0("PC2 (", variance, "%)")
label.y
<- aggregate(PCscores[, c("PC1", "PC2")], by = list(PCscores$colony), FUN = mean)
repMean colnames(repMean)[1] <- "colony"
$point <- "mean"
repMean$group <- 0
repMean<- repMean
repMean1 $group <- 1
repMean1<- repMean
repMean2 $group <- 2
repMean2
<- rbind(PCscores[, c("PC1", "PC2", "colony", "group")], repMean[, c("PC1",
plotData "PC2", "colony", "group")], repMean1[, c("PC1", "PC2", "colony", "group")], repMean2[,
c("PC1", "PC2", "colony", "group")])
$colony_gr <- paste(plotData$colony, plotData$group, sep = "-")
plotData<- ggplot(plotData, aes(x = PC1, y = PC2, color = group, shape = group)) +
figME1 geom_point() + scale_color_manual(name = "replication", values = c(rainbow(3),
"black")) + scale_shape_manual(name = "replication", values = c(16, 16, 16)) +
geom_line(aes(group = colony_gr), color = "black") + xlab(label.x) + ylab(label.y) +
theme(legend.position = "none")
figME1
# Convert from 2D array to 3D array
<- arrayspecs(samplesAdj[xyNames], p, k)
samplesAdj3D dimnames(samplesAdj3D)[[3]] <- rownames(samplesAdj)
<- list(coordsarray = samplesAdj3D, colony = samplesAdj$colony, replication = samplesAdj$replication)
wingsAdjList
<- gm.measurement.error(coords = "coordsarray", subjects = "colony", replicates = "replication",
ME1 data = wingsAdjList)
anova(ME1)
Analysis of Variance, using Residual Randomization
Permutation procedure: Randomization of null model residuals
Number of permutations: 1000
Estimation method: Ordinary Least Squares
Sums of Squares and Cross-products: Type Within-subject II
Effect sizes (Z) based on SNR distributions
Df SS MS Rsq EtaSq.ME SNR Z Pr(>SNR)
Subjects 29 0.01039 0.000358 0.919 266.3 17.9 0.001 ***
Systematic ME 2 0.00087 0.000436 0.077 0.957 22.4 13.8 0.001 ***
Random ME 58 0.00004 0.000001 0.003 0.043
Total 89 0.01130
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Call: Y ~ subjects + replicates
ICCstats(ME1, subjects = "Subjects", with_in = "Systematic ME")
Df SS MS Rsq EtaSq.ME SNR Z Pr(>SNR)
Subjects 29 0.01039 0.000358 0.919 266.3 17.9 0.001 ***
Systematic ME 2 0.00087 0.000436 0.077 0.957 22.4 13.8 0.001 ***
Random ME 58 0.00004 0.000001 0.003 0.043
Total 89 0.01130
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ICC stats based on dispersion of values
ICC ICC_a ICC_c
0.883 0.887 0.994
plot(ME1)
<- plot(ME1)
MEplot <- MEplot$data
plotPoints $x <- MEplot$x
plotPoints$y <- MEplot$y
plotPoints
$colony <- plotPoints$subjects
plotPoints$group <- plotPoints$replicates
plotPoints$point <- plotPoints$group
plotPoints
<- aggregate(plotPoints[, c("x", "y")], by = list(plotPoints$colony), FUN = mean)
repMean colnames(repMean)[1] <- "colony"
$point <- "mean"
repMean$group <- 0
repMean<- repMean
repMean1 $group <- 1
repMean1<- repMean
repMean2 $group <- 2
repMean2
<- rbind(plotPoints[, c("x", "y", "colony", "group", "point")], repMean[,
plotData c("x", "y", "colony", "group", "point")], repMean1[, c("x", "y", "colony", "group",
"point")], repMean2[, c("x", "y", "colony", "group", "point")])
$colony_gr <- paste(plotData$colony, plotData$group, sep = "-")
plotData<- ggplot(plotData, aes(x = x, y = y, color = point, shape = point)) + geom_point() +
figME2 scale_color_manual(name = "replication", values = c(rainbow(3), "black")) + scale_shape_manual(name = "replication",
values = c(16, 16, 16, 16)) + geom_line(aes(group = colony_gr), color = "black",
linewidth = 0.5) + xlab(MEplot$xlab) + ylab(MEplot$ylab) + theme(legend.position = "none")
figME2
ggarrange(figME1, figME2, labels = c("a", "b"), font.label = list(size = 12, face = "bold"),
ncol = 2, nrow = 1)
Procrustes distances
<- wings[wings$country == "Georgia", ]
GE30 $colony <- substr(GE30$file, 5, 7)
GE30$colony <- as.numeric(GE30$colony)
GE30<- data.frame(GE30, row.names = 1) # move column 1 to row names
GE30 <- GE30[GE30$colony < 31, c(xyNames, "colony")]
GE30 nrow(GE30) # number of wings
[1] 1049
# number of workers per colony
<- table(GE30$colony)
nColony <- as.data.frame(nColony)
nColony summary(nColony$Freq)
Min. 1st Qu. Median Mean 3rd Qu. Max.
30.0 34.2 36.0 35.0 36.0 36.0
# Procrustes distances between workers in 30 colonies
<- numeric(30)
pdVec for (i in 1:30) {
<- arrayspecs(GE30[GE30$colony == i, xyNames], p, k)
coords <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
Y.gpa <- Y.gpa$procD
D <- mean(D)
pdVec[i]
}summary(pdVec)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0211 0.0220 0.0228 0.0231 0.0239 0.0283
# Procrustes distance between colonies
<- aggregate(GE30[, xyNames], by = list(GE30$colony), FUN = mean)
GE30colony <- data.frame(GE30colony, row.names = 1) # move column 1 to row names
GE30colony <- arrayspecs(GE30colony, p, k)
coords <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
Y.gpa # Pairwise Procrustes distances among specimens
<- Y.gpa$procD
D summary(D)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00696 0.01153 0.01379 0.01411 0.01617 0.02600
# Procrustes distance between replications
<- aggregate(wingsAdjAligned[, xyNames], by = list(wingsAdjAligned$replication),
coords FUN = mean)
<- data.frame(coords, row.names = 1) # move column 1 to row names
coords <- arrayspecs(coords, p, k)
coords <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
Y.gpa <- Y.gpa$procD
D summary(D)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00420 0.00491 0.00562 0.00532 0.00588 0.00614
# Procrustes distance between countries
<- aggregate(wingsAligned, by = list(wings$country), FUN = mean)
coords <- data.frame(coords, row.names = 1) # move column 1 to row names
coords <- arrayspecs(coords, p, k)
coords <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
Y.gpa <- Y.gpa$procD
D ::kable(as.data.frame(as.matrix(D)), digits = 4) knitr
Azerbaijan | Cyprus | Georgia | Iran | Iraq | Kazakhstan | Saudi Arabia | Tajikistan | Turkey | |
---|---|---|---|---|---|---|---|---|---|
Azerbaijan | 0.0000 | 0.0154 | 0.0195 | 0.0161 | 0.0108 | 0.0174 | 0.0291 | 0.0149 | 0.0145 |
Cyprus | 0.0154 | 0.0000 | 0.0182 | 0.0133 | 0.0170 | 0.0173 | 0.0235 | 0.0180 | 0.0132 |
Georgia | 0.0195 | 0.0182 | 0.0000 | 0.0205 | 0.0189 | 0.0117 | 0.0229 | 0.0155 | 0.0118 |
Iran | 0.0161 | 0.0133 | 0.0205 | 0.0000 | 0.0164 | 0.0151 | 0.0277 | 0.0128 | 0.0185 |
Iraq | 0.0108 | 0.0170 | 0.0189 | 0.0164 | 0.0000 | 0.0157 | 0.0321 | 0.0128 | 0.0162 |
Kazakhstan | 0.0174 | 0.0173 | 0.0117 | 0.0151 | 0.0157 | 0.0000 | 0.0236 | 0.0095 | 0.0127 |
Saudi Arabia | 0.0291 | 0.0235 | 0.0229 | 0.0277 | 0.0321 | 0.0236 | 0.0000 | 0.0284 | 0.0215 |
Tajikistan | 0.0149 | 0.0180 | 0.0155 | 0.0128 | 0.0128 | 0.0095 | 0.0284 | 0.0000 | 0.0163 |
Turkey | 0.0145 | 0.0132 | 0.0118 | 0.0185 | 0.0162 | 0.0127 | 0.0215 | 0.0163 | 0.0000 |
# D
summary(D)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00954 0.01419 0.01635 0.01775 0.01976 0.03213
Influence of sample size on classification results
# single colony
<- wingsAligned[geo.data$sample == "GE-0001", xyNames]
colony
# for each sample size create 36 replicates equal to number of workers in the
# colony
<- nrow(colony)
nrep <- 2^(0:5) # vector with sample size ranging from 1 to 32
nVec <- vector(mode = "list", length = length(nVec))
plotList <- data.frame(matrix(ncol = length(nVec), nrow = nrep))
classDf <- data.frame(matrix(ncol = length(nVec), nrow = nrep))
OpDf set.seed(1)
for (i in 1:length(nVec)) {
<- nVec[i]
n <- replicate(nrep, colMeans(colony[sample(1:nrow(colony), n, replace = FALSE),
replicates drop = FALSE]))
, <- t(replicates)
replicates
<- gmLdaData2id(idData, replicates, average = FALSE)
id <- id$plot # add plot to list
plotList[[i]] <- id$id$group
classDf[, i] <- id$P[, 4]
OpDf[, i]
}
<- letters[1:6]
plotLabels <- paste(plotLabels, nVec, sep = ", n=")
plotLabels ggarrange(plotlist = plotList, labels = plotLabels, font.label = list(size = 12,
face = "bold"), ncol = 2, nrow = 3)
classDf
X1 X2 X3 X4 X5 X6
1 O O O O O O
2 O O O O O O
3 O A O O O O
4 A O O O O O
5 O O O O O O
6 O O O O O O
7 O A O O O O
8 O O O O O O
9 O O O O O O
10 O O O O O O
11 O O O O O O
12 O O O O O O
13 O O O O O O
14 O O O O O O
15 O O O O O O
16 O O O O O O
17 O O O O O O
18 O O O O O O
19 O O O O O O
20 O O O O O O
21 O O O O O O
22 O O O O O O
23 O O O O O O
24 O O O O O O
25 O O O O O O
26 O O O O O O
27 A O O O O O
28 O O O O O O
29 O O O O O O
30 O O O O O O
31 O O O O O O
32 O O O O O O
33 O O O O O O
34 O O O O O O
35 O O O O O O
36 O O O O O O
References
Machlowska, J., Kandemir, I., Özkan Koca, A., Kakhniashvili, V., Alattal, Y., Alghamdi, A., & Tofilski, A. (2025). Fore wings of honey bees (Apis mellifera) from southwestern Asia [Data set]. Zenodo. https://doi.org/10.5281/zenodo.17075125
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofilski, A. (2018). Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49(2), 172-184. https://doi.org/10.1007/s13592-017-0538-y
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofiilski, A. (2018). Dataset: Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49, 172–184. https://doi.org/10.5281/zenodo.7567336
Temirbayeva, K., Torekhanov, A., Nuralieva, U., Sheralieva, Z., & Tofilski, A. (2023). In Search of Apis mellifera pomonella in Kazakhstan. Life, 13(9), 1860. https://doi.org/10.3390/life13091860
Temirbayeva, K., Torekhanov, A., Nuralieva, U., Sheralieva, Z., & Tofilski, A. (2023). Fore wings of honey bees (Apis mellifera) from Kazakhstan [Data set]. Zenodo. https://doi.org/10.5281/zenodo.8128010
Session information
sessionInfo()
R version 4.5.0 (2025-04-11 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 22000)
Matrix products: default
LAPACK version 3.12.1
locale:
[1] LC_COLLATE=Polish_Poland.utf8 LC_CTYPE=Polish_Poland.utf8
[3] LC_MONETARY=Polish_Poland.utf8 LC_NUMERIC=C
[5] LC_TIME=Polish_Poland.utf8
time zone: Europe/Warsaw
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggpubr_0.6.0 sf_1.0-21 raster_3.6-32
[4] sp_2.2-0 rnaturalearth_1.0.1 ggplot2_3.5.2
[7] stringr_1.5.1 IdentiFlyR_0.1.1 reshape2_1.4.4
[10] Morpho_2.12 geomorph_4.0.10 Matrix_1.7-3
[13] rgl_1.3.18 RRPP_2.1.2
loaded via a namespace (and not attached):
[1] tidyselect_1.2.1 dplyr_1.1.4 farver_2.1.2
[4] fastmap_1.2.0 tweenr_2.0.3 digest_0.6.37
[7] lifecycle_1.0.4 terra_1.8-50 bezier_1.1.2
[10] magrittr_2.0.3 compiler_4.5.0 rlang_1.1.6
[13] tools_4.5.0 yaml_2.3.10 knitr_1.50
[16] ggsignif_0.6.4 labeling_0.4.3 htmlwidgets_1.6.4
[19] scatterplot3d_0.3-44 classInt_0.4-11 curl_6.2.2
[22] plyr_1.8.9 xml2_1.3.8 RColorBrewer_1.1-3
[25] shapes_1.2.7 abind_1.4-8 KernSmooth_2.23-26
[28] withr_3.0.2 purrr_1.0.4 polyclip_1.10-7
[31] grid_4.5.0 e1071_1.7-16 scales_1.4.0
[34] iterators_1.0.14 MASS_7.3-65 cli_3.6.5
[37] rmarkdown_2.29 generics_0.1.4 rstudioapi_0.17.1
[40] httr_1.4.7 Rvcg_0.25 ggforce_0.4.2
[43] DBI_1.2.3 ape_5.8-1 proxy_0.4-27
[46] parallel_4.5.0 formatR_1.14 s2_1.1.8
[49] base64enc_0.1-3 vctrs_0.6.5 minpack.lm_1.2-4
[52] jsonlite_2.0.0 carData_3.0-5 car_3.1-3
[55] ggrepel_0.9.6 rstatix_0.7.2 Formula_1.2-5
[58] jpeg_0.1-11 foreach_1.5.2 tidyr_1.3.1
[61] units_0.8-7 colorRamps_2.3.4 glue_1.8.0
[64] codetools_0.2-20 cowplot_1.1.3 stringi_1.8.7
[67] gtable_0.3.6 tibble_3.2.1 pillar_1.10.2
[70] htmltools_0.5.8.1 rnaturalearthdata_1.0.0 R6_2.6.1
[73] wk_0.9.4 doParallel_1.0.17 evaluate_1.0.3
[76] lattice_0.22-6 backports_1.5.0 broom_1.0.8
[79] class_7.3-23 Rcpp_1.0.14 gridExtra_2.3
[82] nlme_3.1-168 xfun_0.52 pkgconfig_2.0.3