# calculations
library(geomorph) # GPA
library(Morpho) # CVA
library(reshape2) # melt
library(IdentiFlyR) # classification
library(stringr) #str_split_fixed
# plotting and visualization
library(ggplot2) # plots
ggplot2::theme_set(theme_light())
library(rnaturalearth) # maps
library(raster) # raster
library(sf) # st_bbox
library(ggpubr) # ggarangeAnalysis of honey bee wing shape in southwestern Asia
Libraries
Variable names
p <- 19 # number of landmarks
k <- 2 # number of dimensions, in this case 2 for coordinates (x, y)
# create coordinates names used by IdentiFly
xyNames <- c("x1", "y1")
for (i in 2:p) {
xyNames <- c(xyNames, paste0("x", i))
xyNames <- c(xyNames, paste0("y", i))
}
xyNames [1] "x1" "y1" "x2" "y2" "x3" "y3" "x4" "y4" "x5" "y5" "x6" "y6"
[13] "x7" "y7" "x8" "y8" "x9" "y9" "x10" "y10" "x11" "y11" "x12" "y12"
[25] "x13" "y13" "x14" "y14" "x15" "y15" "x16" "y16" "x17" "y17" "x18" "y18"
[37] "x19" "y19"
# The number of principal components used is 2*p-4 = 34, which is equal to the
# degrees of freedom create principal components names used by prcomp
pcNames <- paste0("PC", 1:(2 * p - 4))
pcNames [1] "PC1" "PC2" "PC3" "PC4" "PC5" "PC6" "PC7" "PC8" "PC9" "PC10"
[11] "PC11" "PC12" "PC13" "PC14" "PC15" "PC16" "PC17" "PC18" "PC19" "PC20"
[21] "PC21" "PC22" "PC23" "PC24" "PC25" "PC26" "PC27" "PC28" "PC29" "PC30"
[31] "PC31" "PC32" "PC33" "PC34"
# define which landmarks are connected by lines in wireframe graph
link.x <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 14,
15, 16, 17)
link.y <- c(2, 3, 4, 5, 6, 19, 6, 10, 12, 8, 8, 14, 19, 9, 10, 15, 11, 12, 16, 13,
18, 15, 16, 17, 18)
links.apis <- cbind(link.x, link.y)Read data
Read raw coordinates of 19 landmarks from Zenodo (Machlowska et al., 2025) Data for Kazakhstan refer to previous publication Temirbayeva et al., (2023a) and Temirbayeva et al., (2023b)
wings <- read.csv("https://zenodo.org/records/17075125/files/AZ-raw-coordinates.csv")
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/CY-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/GE-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/IQ-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/IR-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/SA-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/TJ-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/TR-raw-coordinates.csv"))
geo.data <- read.csv("https://zenodo.org/records/17075125/files/AZ-data.csv")
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/CY-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/GE-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/IQ-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/IR-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/SA-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/TJ-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/TR-data.csv"))
KZ.geo.data <- read.csv("https://zenodo.org/record/8128010/files/KZ-data.csv", strip.white = TRUE)
colnames(KZ.geo.data)[colnames(KZ.geo.data) == "group"] <- "notes"
KZ.wings <- read.csv("https://zenodo.org/record/8128010/files/KZ-raw-coordinates.csv")
# use only samples declared as 'local bees'
KZ.wings <- subset(KZ.wings, KZ.geo.data$notes == "local bee")
wings <- rbind(wings, KZ.wings)
KZ.geo.data <- subset(KZ.geo.data, notes == "local bee")
geo.data <- rbind(geo.data, KZ.geo.data)
# extract classifiers
wings$sample <- substr(wings$file, 1, 7)
wings$country <- substr(wings$file, 1, 2)
wings$country <- gsub("AZ", "Azerbaijan", wings$country)
wings$country <- gsub("CY", "Cyprus", wings$country)
wings$country <- gsub("GE", "Georgia", wings$country)
wings$country <- gsub("IQ", "Iraq", wings$country)
wings$country <- gsub("IR", "Iran", wings$country)
wings$country <- gsub("KZ", "Kazakhstan", wings$country)
wings$country <- gsub("SA", "Saudi Arabia", wings$country)
wings$country <- gsub("TJ", "Tajikistan", wings$country)
wings$country <- gsub("TR", "Turkey", wings$country)
# Aggregate geographic data by sample
geoDataSample <- aggregate(geo.data[c("latitude", "longitude")], by = list(wings$sample),
FUN = mean)
geoDataSample <- data.frame(geoDataSample, row.names = 1) # move column 1 to row names
geoDataSample$country <- substr(rownames(geoDataSample), 1, 2)
geoDataSample$country <- gsub("AZ", "Azerbaijan", geoDataSample$country)
geoDataSample$country <- gsub("CY", "Cyprus", geoDataSample$country)
geoDataSample$country <- gsub("GE", "Georgia", geoDataSample$country)
geoDataSample$country <- gsub("IQ", "Iraq", geoDataSample$country)
geoDataSample$country <- gsub("IR", "Iran", geoDataSample$country)
geoDataSample$country <- gsub("KZ", "Kazakhstan", geoDataSample$country)
geoDataSample$country <- gsub("SA", "Saudi Arabia", geoDataSample$country)
geoDataSample$country <- gsub("TJ", "Tajikistan", geoDataSample$country)
geoDataSample$country <- gsub("TR", "Turkey", geoDataSample$country)
nrow(wings) # number of wings[1] 17457
table(wings$country) # number of wings per country
Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan
114 654 3579 3399 91 442
Saudi Arabia Tajikistan Turkey
1397 88 7693
nrow(geoDataSample) # number of samples[1] 1557
table(geoDataSample$country) # number of samples per country
Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan
14 73 102 401 11 29
Saudi Arabia Tajikistan Turkey
127 5 795
# number of workers per sample
nColony <- table(wings$sample)
nColony <- data.frame(nColony, row.names = 1) # move column 1 to row names
ggplot(nColony, aes(x = Freq)) + geom_histogram() + coord_cartesian(xlim = c(0, 40))summary(nColony$Freq) Min. 1st Qu. Median Mean 3rd Qu. Max.
4.0 8.0 9.0 11.2 10.0 40.0
# number of samples per location
location <- paste(round(geoDataSample$latitude, 1), round(geoDataSample$longitude,
1))
nLocation <- table(location)
nLocation <- as.data.frame(nLocation)
locationLatLon <- str_split_fixed(nLocation$location, " ", 2)
colnames(locationLatLon) <- c("latitude", "longitude")
locationLatLon <- apply(locationLatLon, 2, as.numeric)
locationLatLon <- as.data.frame(locationLatLon)
nLocation <- cbind(nLocation, locationLatLon)Maps
# Read elevation data. In order to download the TIF file uncomment the two
# lines below.
# download.file('https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip',
# 'wc2.1_2.5m_elev.zip'); unzip('wc2.1_2.5m_elev.zip'); Or use your local file.
elev <- raster("D:/WorldClim/wc2.1_2.5m_elev.tif")
elev.color <- colorRampPalette(c("#f7f7f7", "#d9d9d9", "#bdbdbd", "#636363", "#303030"),
bias = 2)
x.min <- 25
x.max <- 83
y.min <- 15
y.max <- 46
e <- extent(x.min - 5, x.max + 5, y.min - 5, y.max + 5)
elev <- crop(elev, e)
elev.df <- data.frame(rasterToPoints(elev))
colnames(elev.df) <- c("longitude", "latitude", "altitude")
world <- ne_countries(scale = "medium", returnclass = "sf")
# Extract Caspian See outline
coastline <- ne_coastline(scale = "medium", returnclass = "sf")
bbox <- st_bbox(c(xmin = 46, xmax = 56, ymin = 36, ymax = 47))
caspian <- st_crop(coastline, bbox)
caspian$geometry <- sf::st_cast(caspian$geometry, "POLYGON") # convert to polygon
ggplot(data = world) + geom_raster(data = elev.df, aes(longitude, latitude, fill = altitude)) +
geom_sf(data = caspian, fill = "white") + scale_fill_gradientn(colours = elev.color(100)) +
geom_sf(fill = NA) + coord_sf(xlim = c(x.min, x.max), ylim = c(y.min, y.max)) +
geom_jitter(data = geoDataSample, aes(x = longitude, y = latitude, color = country,
shape = country), size = 0.7, width = 0.3, height = 0.3) + scale_color_manual(name = "country",
values = rainbow(9)) + scale_shape_manual(name = "country", values = 1:9) + theme(legend.position = "right")ggplot(data = world) + geom_raster(data = elev.df, aes(longitude, latitude, fill = altitude)) +
geom_sf(data = caspian, fill = "white") + scale_fill_gradientn(colours = elev.color(100)) +
geom_sf(fill = NA) + coord_sf(xlim = c(x.min, x.max), ylim = c(y.min, y.max)) +
geom_point(data = nLocation, aes(x = longitude, y = latitude, size = Freq)) +
scale_size_binned_area(name = "colonies", limits = c(0, 100), breaks = c(0, 5,
10, 20, 40)) + theme(legend.position = "right")GPA-alignment
# Convert from 2D array to 3D array
wings3D <- arrayspecs(wings[xyNames], p, k)
dimnames(wings3D)[[3]] <- wings$file
# Align the coordinates using Generalized Procrustes Analysis
GPA <- gpagen(wings3D, print.progress = FALSE)
# plot landmarks after alignment
plotAllSpecimens(GPA$coords, links = links.apis, label = TRUE, plot_param = list(pt.bg = "black",
pt.cex = 0.5, mean.bg = "red", mean.cex = 1, link.col = "red", txt.pos = 3, txt.cex = 1))# Convert from 3D array to 2D array
wingsAligned <- two.d.array(GPA$coords)
colnames(wingsAligned) <- xyNames
samplesAligned <- aggregate(wingsAligned, by = list(wings$sample), FUN = mean)
samplesAligned <- data.frame(samplesAligned, row.names = 1) # move column 1 to row namesPCA colony
PCA <- prcomp(samplesAligned[, xyNames])
summary(PCA)Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 0.0103 0.00784 0.00686 0.00503 0.00414 0.00376 0.0035
Proportion of Variance 0.2851 0.16607 0.12706 0.06836 0.04620 0.03810 0.0331
Cumulative Proportion 0.2851 0.45117 0.57823 0.64659 0.69278 0.73089 0.7640
PC8 PC9 PC10 PC11 PC12 PC13 PC14
Standard deviation 0.00347 0.00302 0.00286 0.00273 0.0027 0.00228 0.00219
Proportion of Variance 0.03247 0.02457 0.02212 0.02008 0.0197 0.01401 0.01293
Cumulative Proportion 0.79646 0.82103 0.84315 0.86324 0.8829 0.89689 0.90982
PC15 PC16 PC17 PC18 PC19 PC20 PC21
Standard deviation 0.00209 0.00201 0.00198 0.00173 0.00159 0.00142 0.00138
Proportion of Variance 0.01180 0.01087 0.01056 0.00811 0.00686 0.00543 0.00511
Cumulative Proportion 0.92162 0.93249 0.94306 0.95116 0.95802 0.96345 0.96856
PC22 PC23 PC24 PC25 PC26 PC27
Standard deviation 0.00134 0.00121 0.00111 0.00109 0.00105 0.000918
Proportion of Variance 0.00484 0.00393 0.00333 0.00318 0.00300 0.002280
Cumulative Proportion 0.97341 0.97733 0.98066 0.98384 0.98684 0.989120
PC28 PC29 PC30 PC31 PC32 PC33
Standard deviation 0.000906 0.000859 0.000835 0.000795 0.000682 0.000605
Proportion of Variance 0.002220 0.001990 0.001880 0.001710 0.001260 0.000990
Cumulative Proportion 0.991330 0.993320 0.995210 0.996920 0.998170 0.999160
PC34 PC35 PC36 PC37 PC38
Standard deviation 0.000558 4.43e-17 2.53e-17 1.88e-17 1.08e-17
Proportion of Variance 0.000840 0.00e+00 0.00e+00 0.00e+00 0.00e+00
Cumulative Proportion 1.000000 1.00e+00 1.00e+00 1.00e+00 1.00e+00
PCscores <- as.data.frame(PCA$x)
PCscores <- cbind(PCscores, geoDataSample)
# create plot labels
variance.tab <- summary(PCA)$importance
variance <- variance.tab["Proportion of Variance", "PC1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "PC2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")
figPca <- ggplot(PCscores, aes(x = PC1, y = PC2, shape = country, color = country)) +
geom_point(size = 0.7) + scale_shape_manual(name = "country", values = 1:9) +
scale_color_manual(name = "country", values = rainbow(9)) + stat_ellipse() +
xlab(label.x) + ylab(label.y)
figPcaMANOVA
MANOVA <- manova(as.matrix(cbind(PCscores[, pcNames])) ~ country, PCscores)
summary(MANOVA) Df Pillai approx F num Df den Df Pr(>F)
country 8 3.29 31.3 272 12176 <2e-16 ***
Residuals 1548
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
LDA
# use equal prior probability for all groups
n.gr <- length(unique(PCscores$country)) # number of groups
LDA <- CVA(PCscores[pcNames], PCscores$country, rounds = 10000, cv = TRUE, prior = rep(1/n.gr,
n.gr))
LDscores <- as.data.frame(LDA$CVscores)
# rename variable names from CV to LD otherwise use `CV 1`
colnames(LDscores) <- gsub("CV ", "LD", colnames(LDscores))
LDscores$country <- geoDataSample$country
figLda <- ggplot(LDscores, aes(x = LD1, y = LD2, shape = country, color = country)) +
geom_point(size = 0.7) + scale_shape_manual(name = "country", values = 1:9) +
scale_color_manual(name = "country", values = rainbow(9)) + stat_ellipse()
figLdaggarrange(figPca, figLda, labels = c("a", "b"), font.label = list(size = 12, face = "bold"),
ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom")# Classification of the samples to groups
LDclass <- typprobClass(LDA$CVscores, groups = as.factor(LDscores$country), outlier = 0)
print(LDclass) cross-validated classification results in frequencies
Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan Saudi Arabia
Azerbaijan 14 0 0 0 0 0 0
Cyprus 1 68 0 0 2 0 0
Georgia 0 0 102 0 0 0 0
Iran 0 0 0 398 0 2 0
Iraq 0 0 0 0 11 0 0
Kazakhstan 0 0 1 0 0 26 0
Saudi Arabia 0 0 0 0 0 0 127
Tajikistan 0 0 0 0 0 0 0
Turkey 5 8 8 2 5 12 0
Tajikistan Turkey
Azerbaijan 0 0
Cyprus 0 2
Georgia 0 0
Iran 1 0
Iraq 0 0
Kazakhstan 2 0
Saudi Arabia 0 0
Tajikistan 5 0
Turkey 1 754
cross-validated classification result in %
Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan
Azerbaijan 100.00000 0.00000 0.00000 0.00000 0.00000 0.00000
Cyprus 1.36986 93.15068 0.00000 0.00000 2.73973 0.00000
Georgia 0.00000 0.00000 100.00000 0.00000 0.00000 0.00000
Iran 0.00000 0.00000 0.00000 99.25187 0.00000 0.49875
Iraq 0.00000 0.00000 0.00000 0.00000 100.00000 0.00000
Kazakhstan 0.00000 0.00000 3.44828 0.00000 0.00000 89.65517
Saudi Arabia 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
Tajikistan 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
Turkey 0.62893 1.00629 1.00629 0.25157 0.62893 1.50943
Saudi Arabia Tajikistan Turkey
Azerbaijan 0.00000 0.00000 0.00000
Cyprus 0.00000 0.00000 2.73973
Georgia 0.00000 0.00000 0.00000
Iran 0.00000 0.24938 0.00000
Iraq 0.00000 0.00000 0.00000
Kazakhstan 0.00000 6.89655 0.00000
Saudi Arabia 100.00000 0.00000 0.00000
Tajikistan 0.00000 100.00000 0.00000
Turkey 0.00000 0.12579 94.84277
overall classification accuracy: 96.66024 %
Kappa statistic: 0.95027
# Mahalanobis distances between groups
knitr::kable(as.data.frame(as.matrix(LDA$Dist$GroupdistMaha)), digits = 3)| Azerbaijan | Cyprus | Georgia | Iran | Iraq | Kazakhstan | Saudi Arabia | Tajikistan | Turkey | |
|---|---|---|---|---|---|---|---|---|---|
| Azerbaijan | 0.00 | 5.16 | 7.71 | 6.67 | 4.62 | 7.76 | 10.60 | 7.38 | 4.93 |
| Cyprus | 5.16 | 0.00 | 7.62 | 6.53 | 5.74 | 7.57 | 9.94 | 7.89 | 4.91 |
| Georgia | 7.71 | 7.62 | 0.00 | 6.49 | 7.30 | 5.64 | 9.66 | 5.37 | 5.16 |
| Iran | 6.67 | 6.53 | 6.49 | 0.00 | 6.48 | 5.28 | 9.38 | 5.12 | 6.24 |
| Iraq | 4.62 | 5.74 | 7.30 | 6.48 | 0.00 | 6.92 | 11.34 | 7.06 | 5.31 |
| Kazakhstan | 7.76 | 7.57 | 5.64 | 5.28 | 6.92 | 0.00 | 9.86 | 4.03 | 5.65 |
| Saudi Arabia | 10.60 | 9.94 | 9.66 | 9.38 | 11.34 | 9.86 | 0.00 | 9.66 | 8.91 |
| Tajikistan | 7.38 | 7.89 | 5.37 | 5.12 | 7.06 | 4.03 | 9.66 | 0.00 | 6.40 |
| Turkey | 4.93 | 4.91 | 5.16 | 6.24 | 5.31 | 5.65 | 8.91 | 6.40 | 0.00 |
# Probabilities of differences between groups
knitr::kable(as.data.frame(as.matrix(LDA$Dist$probsMaha)), digits = 5)| Azerbaijan | Cyprus | Georgia | Iran | Iraq | Kazakhstan | Saudi Arabia | Tajikistan | Turkey | |
|---|---|---|---|---|---|---|---|---|---|
| Azerbaijan | 0.0000 | 1e-04 | 0.0001 | 0.0001 | 0.0015 | 0.0001 | 1e-04 | 0.0002 | 1e-04 |
| Cyprus | 0.0001 | 0e+00 | 0.0001 | 0.0001 | 0.0001 | 0.0001 | 1e-04 | 0.0001 | 1e-04 |
| Georgia | 0.0001 | 1e-04 | 0.0000 | 0.0001 | 0.0001 | 0.0001 | 1e-04 | 0.0031 | 1e-04 |
| Iran | 0.0001 | 1e-04 | 0.0001 | 0.0000 | 0.0001 | 0.0001 | 1e-04 | 0.0044 | 1e-04 |
| Iraq | 0.0015 | 1e-04 | 0.0001 | 0.0001 | 0.0000 | 0.0001 | 1e-04 | 0.0002 | 1e-04 |
| Kazakhstan | 0.0001 | 1e-04 | 0.0001 | 0.0001 | 0.0001 | 0.0000 | 1e-04 | 0.1117 | 1e-04 |
| Saudi Arabia | 0.0001 | 1e-04 | 0.0001 | 0.0001 | 0.0001 | 0.0001 | 0e+00 | 0.0001 | 1e-04 |
| Tajikistan | 0.0002 | 1e-04 | 0.0031 | 0.0044 | 0.0002 | 0.1117 | 1e-04 | 0.0000 | 5e-04 |
| Turkey | 0.0001 | 1e-04 | 0.0001 | 0.0001 | 0.0001 | 0.0001 | 1e-04 | 0.0005 | 0e+00 |
Classification of colonies as lineages
Comparison with lineages dataset Nawrocka et al., (2018a) and Nawrocka et al., (2018b)
idData <- xml2gmLdaData("https://zenodo.org/records/14054009/files/apis-mellifera-lineage.dw.xml")
id <- gmLdaData2id(idData, samplesAligned[, xyNames], average = FALSE)
id$plottab <- table(id$id$group)
tab
A C M O
383 260 85 829
prop.table(tab)
A C M O
0.2460 0.1670 0.0546 0.5324
idID <- id$id
idID$country <- substr(rownames(idID), 1, 2)
tab <- table(idID$group, idID$country)
tab
AZ CY GE IQ IR KZ SA TJ TR
A 0 2 7 0 146 6 127 0 95
C 2 43 0 1 107 11 0 3 93
M 0 8 0 0 52 0 0 0 25
O 12 20 95 10 96 12 0 2 582
prop.table(tab, margin = 2)
AZ CY GE IQ IR KZ SA TJ TR
A 0.0000 0.0274 0.0686 0.0000 0.3641 0.2069 1.0000 0.0000 0.1195
C 0.1429 0.5890 0.0000 0.0909 0.2668 0.3793 0.0000 0.6000 0.1170
M 0.0000 0.1096 0.0000 0.0000 0.1297 0.0000 0.0000 0.0000 0.0314
O 0.8571 0.2740 0.9314 0.9091 0.2394 0.4138 0.0000 0.4000 0.7321
idMD <- id$MD2
pTab <- as.data.frame(id$P)
sums <- rowSums(pTab)
pTab <- pTab/sums
pTabLong <- pTab
pTabLong$sample <- rownames(pTabLong)
pTabLong <- melt(pTabLong, id.vars = c("sample"))
pTabLong$gr <- substr(pTabLong$sample, 1, 2) # short 2 letter name
ggplot(pTabLong, aes(sample, value, fill = variable)) + geom_col(width = 1) + scale_fill_manual(values = rainbow(4),
name = "") + facet_grid(~gr, switch = "x", scales = "free", space = "free") +
theme_minimal() + labs(x = NULL, y = NULL) + scale_y_continuous(expand = c(0,
0)) + guides(fill = guide_legend(nrow = 1, byrow = TRUE)) + theme(panel.spacing.x = unit(0.2,
"lines"), axis.text.x = element_blank(), legend.position = "top")geoDataSample$lineage <- idID$group
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample, aes(x = longitude,
y = latitude, color = lineage, shape = lineage), width = 0.4, height = 0.4, size = 0.7) +
coord_sf(xlim = c(x.min, x.max), ylim = c(y.min, y.max)) + scale_color_manual(name = "",
values = rainbow(4)) + scale_shape_manual(name = "", values = 1:4) + guides(color = guide_legend(nrow = 1,
byrow = TRUE)) + theme(legend.position = "top")# map of lineage A
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
"A", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
x.max), ylim = c(y.min, y.max))# map of lineage C
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
"C", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
x.max), ylim = c(y.min, y.max))# map of lineage M
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
"M", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
x.max), ylim = c(y.min, y.max))# map of lineage O
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
"O", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
x.max), ylim = c(y.min, y.max))Error of measurements
wingsAdj <- read.csv("https://zenodo.org/records/17075125/files/GE-30-3x.csv", row.names = 1)
wingsAdj3D <- arrayspecs(wingsAdj[xyNames], p, k)
dimnames(wingsAdj3D)[[3]] <- rownames(wingsAdj)
# Align the coordinates using Generalized Procrustes Analysis
GPA <- gpagen(wingsAdj3D, print.progress = FALSE)
wingsAdjAligned <- two.d.array(GPA$coords)
colnames(wingsAdjAligned) <- xyNames
wingsAdjAligned <- as.data.frame(wingsAdjAligned)
wingsAdjAligned$colony <- substr(rownames(wingsAdj), 5, 7)
wingsAdjAligned$colony <- as.numeric(wingsAdjAligned$colony)
wingsAdjAligned$replication <- str_sub(rownames(wingsAdj), -8, -8)
wingsAdjAligned$colony_rep <- paste(wingsAdjAligned$colony, wingsAdjAligned$replication,
sep = "_")
samplesAdj <- aggregate(wingsAdjAligned[xyNames], by = list(wingsAdjAligned$colony_rep),
FUN = mean)
samplesAdj <- data.frame(samplesAdj, row.names = 1) # move column 1 to row names
colonyRep <- str_split_fixed(rownames(samplesAdj), "_", 2)
colnames(colonyRep) <- c("colony", "replication")
samplesAdj <- cbind(samplesAdj, colonyRep)
table(samplesAdj$replication)
0 1 2
30 30 30
PCA <- prcomp(samplesAdj[, xyNames])
PCscores <- as.data.frame(PCA$x)
PCscores$colony <- samplesAdj$colony
PCscores$group <- samplesAdj$replication
PCscores$point <- PCscores$group
# create plot labels
variance.tab <- summary(PCA)$importance
variance <- variance.tab["Proportion of Variance", "PC1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "PC2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")
repMean <- aggregate(PCscores[, c("PC1", "PC2")], by = list(PCscores$colony), FUN = mean)
colnames(repMean)[1] <- "colony"
repMean$point <- "mean"
repMean$group <- 0
repMean1 <- repMean
repMean1$group <- 1
repMean2 <- repMean
repMean2$group <- 2
plotData <- rbind(PCscores[, c("PC1", "PC2", "colony", "group")], repMean[, c("PC1",
"PC2", "colony", "group")], repMean1[, c("PC1", "PC2", "colony", "group")], repMean2[,
c("PC1", "PC2", "colony", "group")])
plotData$colony_gr <- paste(plotData$colony, plotData$group, sep = "-")
figME1 <- ggplot(plotData, aes(x = PC1, y = PC2, color = group, shape = group)) +
geom_point() + scale_color_manual(name = "replication", values = c(rainbow(3),
"black")) + scale_shape_manual(name = "replication", values = c(16, 16, 16)) +
geom_line(aes(group = colony_gr), color = "black") + xlab(label.x) + ylab(label.y) +
theme(legend.position = "none")
figME1# Convert from 2D array to 3D array
samplesAdj3D <- arrayspecs(samplesAdj[xyNames], p, k)
dimnames(samplesAdj3D)[[3]] <- rownames(samplesAdj)
wingsAdjList <- list(coordsarray = samplesAdj3D, colony = samplesAdj$colony, replication = samplesAdj$replication)
ME1 <- gm.measurement.error(coords = "coordsarray", subjects = "colony", replicates = "replication",
data = wingsAdjList)
anova(ME1)
Analysis of Variance, using Residual Randomization
Permutation procedure: Randomization of null model residuals
Number of permutations: 1000
Estimation method: Ordinary Least Squares
Sums of Squares and Cross-products: Type Within-subject II
Effect sizes (Z) based on SNR distributions
Df SS MS Rsq EtaSq.ME SNR Z Pr(>SNR)
Subjects 29 0.01039 0.000358 0.919 266.3 17.9 0.001 ***
Systematic ME 2 0.00087 0.000436 0.077 0.957 22.4 13.8 0.001 ***
Random ME 58 0.00004 0.000001 0.003 0.043
Total 89 0.01130
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Call: Y ~ subjects + replicates
ICCstats(ME1, subjects = "Subjects", with_in = "Systematic ME") Df SS MS Rsq EtaSq.ME SNR Z Pr(>SNR)
Subjects 29 0.01039 0.000358 0.919 266.3 17.9 0.001 ***
Systematic ME 2 0.00087 0.000436 0.077 0.957 22.4 13.8 0.001 ***
Random ME 58 0.00004 0.000001 0.003 0.043
Total 89 0.01130
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ICC stats based on dispersion of values
ICC ICC_a ICC_c
0.883 0.887 0.994
plot(ME1)MEplot <- plot(ME1)
plotPoints <- MEplot$data
plotPoints$x <- MEplot$x
plotPoints$y <- MEplot$y
plotPoints$colony <- plotPoints$subjects
plotPoints$group <- plotPoints$replicates
plotPoints$point <- plotPoints$group
repMean <- aggregate(plotPoints[, c("x", "y")], by = list(plotPoints$colony), FUN = mean)
colnames(repMean)[1] <- "colony"
repMean$point <- "mean"
repMean$group <- 0
repMean1 <- repMean
repMean1$group <- 1
repMean2 <- repMean
repMean2$group <- 2
plotData <- rbind(plotPoints[, c("x", "y", "colony", "group", "point")], repMean[,
c("x", "y", "colony", "group", "point")], repMean1[, c("x", "y", "colony", "group",
"point")], repMean2[, c("x", "y", "colony", "group", "point")])
plotData$colony_gr <- paste(plotData$colony, plotData$group, sep = "-")
figME2 <- ggplot(plotData, aes(x = x, y = y, color = point, shape = point)) + geom_point() +
scale_color_manual(name = "replication", values = c(rainbow(3), "black")) + scale_shape_manual(name = "replication",
values = c(16, 16, 16, 16)) + geom_line(aes(group = colony_gr), color = "black",
linewidth = 0.5) + xlab(MEplot$xlab) + ylab(MEplot$ylab) + theme(legend.position = "none")
figME2ggarrange(figME1, figME2, labels = c("a", "b"), font.label = list(size = 12, face = "bold"),
ncol = 2, nrow = 1)Procrustes distances
GE30 <- wings[wings$country == "Georgia", ]
GE30$colony <- substr(GE30$file, 5, 7)
GE30$colony <- as.numeric(GE30$colony)
GE30 <- data.frame(GE30, row.names = 1) # move column 1 to row names
GE30 <- GE30[GE30$colony < 31, c(xyNames, "colony")]
nrow(GE30) # number of wings[1] 1049
# number of workers per colony
nColony <- table(GE30$colony)
nColony <- as.data.frame(nColony)
summary(nColony$Freq) Min. 1st Qu. Median Mean 3rd Qu. Max.
30.0 34.2 36.0 35.0 36.0 36.0
# Procrustes distances between workers in 30 colonies
pdVec <- numeric(30)
for (i in 1:30) {
coords <- arrayspecs(GE30[GE30$colony == i, xyNames], p, k)
Y.gpa <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
D <- Y.gpa$procD
pdVec[i] <- mean(D)
}
summary(pdVec) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0211 0.0220 0.0228 0.0231 0.0239 0.0283
# Procrustes distance between colonies
GE30colony <- aggregate(GE30[, xyNames], by = list(GE30$colony), FUN = mean)
GE30colony <- data.frame(GE30colony, row.names = 1) # move column 1 to row names
coords <- arrayspecs(GE30colony, p, k)
Y.gpa <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
# Pairwise Procrustes distances among specimens
D <- Y.gpa$procD
summary(D) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00696 0.01153 0.01379 0.01411 0.01617 0.02600
# Procrustes distance between replications
coords <- aggregate(wingsAdjAligned[, xyNames], by = list(wingsAdjAligned$replication),
FUN = mean)
coords <- data.frame(coords, row.names = 1) # move column 1 to row names
coords <- arrayspecs(coords, p, k)
Y.gpa <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
D <- Y.gpa$procD
summary(D) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00420 0.00491 0.00562 0.00532 0.00588 0.00614
# Procrustes distance between countries
coords <- aggregate(wingsAligned, by = list(wings$country), FUN = mean)
coords <- data.frame(coords, row.names = 1) # move column 1 to row names
coords <- arrayspecs(coords, p, k)
Y.gpa <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
D <- Y.gpa$procD
knitr::kable(as.data.frame(as.matrix(D)), digits = 4)| Azerbaijan | Cyprus | Georgia | Iran | Iraq | Kazakhstan | Saudi Arabia | Tajikistan | Turkey | |
|---|---|---|---|---|---|---|---|---|---|
| Azerbaijan | 0.0000 | 0.0154 | 0.0195 | 0.0161 | 0.0108 | 0.0174 | 0.0291 | 0.0149 | 0.0145 |
| Cyprus | 0.0154 | 0.0000 | 0.0182 | 0.0133 | 0.0170 | 0.0173 | 0.0235 | 0.0180 | 0.0132 |
| Georgia | 0.0195 | 0.0182 | 0.0000 | 0.0205 | 0.0189 | 0.0117 | 0.0229 | 0.0155 | 0.0118 |
| Iran | 0.0161 | 0.0133 | 0.0205 | 0.0000 | 0.0164 | 0.0151 | 0.0277 | 0.0128 | 0.0185 |
| Iraq | 0.0108 | 0.0170 | 0.0189 | 0.0164 | 0.0000 | 0.0157 | 0.0321 | 0.0128 | 0.0162 |
| Kazakhstan | 0.0174 | 0.0173 | 0.0117 | 0.0151 | 0.0157 | 0.0000 | 0.0236 | 0.0095 | 0.0127 |
| Saudi Arabia | 0.0291 | 0.0235 | 0.0229 | 0.0277 | 0.0321 | 0.0236 | 0.0000 | 0.0284 | 0.0215 |
| Tajikistan | 0.0149 | 0.0180 | 0.0155 | 0.0128 | 0.0128 | 0.0095 | 0.0284 | 0.0000 | 0.0163 |
| Turkey | 0.0145 | 0.0132 | 0.0118 | 0.0185 | 0.0162 | 0.0127 | 0.0215 | 0.0163 | 0.0000 |
# D
summary(D) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00954 0.01419 0.01635 0.01775 0.01976 0.03213
Influence of sample size on classification results
# single colony
colony <- wingsAligned[geo.data$sample == "GE-0001", xyNames]
# for each sample size create 36 replicates equal to number of workers in the
# colony
nrep <- nrow(colony)
nVec <- 2^(0:5) # vector with sample size ranging from 1 to 32
plotList <- vector(mode = "list", length = length(nVec))
classDf <- data.frame(matrix(ncol = length(nVec), nrow = nrep))
OpDf <- data.frame(matrix(ncol = length(nVec), nrow = nrep))
set.seed(1)
for (i in 1:length(nVec)) {
n <- nVec[i]
replicates <- replicate(nrep, colMeans(colony[sample(1:nrow(colony), n, replace = FALSE),
, drop = FALSE]))
replicates <- t(replicates)
id <- gmLdaData2id(idData, replicates, average = FALSE)
plotList[[i]] <- id$plot # add plot to list
classDf[, i] <- id$id$group
OpDf[, i] <- id$P[, 4]
}
plotLabels <- letters[1:6]
plotLabels <- paste(plotLabels, nVec, sep = ", n=")
ggarrange(plotlist = plotList, labels = plotLabels, font.label = list(size = 12,
face = "bold"), ncol = 2, nrow = 3)classDf X1 X2 X3 X4 X5 X6
1 O O O O O O
2 O O O O O O
3 O A O O O O
4 A O O O O O
5 O O O O O O
6 O O O O O O
7 O A O O O O
8 O O O O O O
9 O O O O O O
10 O O O O O O
11 O O O O O O
12 O O O O O O
13 O O O O O O
14 O O O O O O
15 O O O O O O
16 O O O O O O
17 O O O O O O
18 O O O O O O
19 O O O O O O
20 O O O O O O
21 O O O O O O
22 O O O O O O
23 O O O O O O
24 O O O O O O
25 O O O O O O
26 O O O O O O
27 A O O O O O
28 O O O O O O
29 O O O O O O
30 O O O O O O
31 O O O O O O
32 O O O O O O
33 O O O O O O
34 O O O O O O
35 O O O O O O
36 O O O O O O
References
Machlowska, J., Kandemir, I., Özkan Koca, A., Kakhniashvili, V., Alattal, Y., Alghamdi, A., & Tofilski, A. (2025). Fore wings of honey bees (Apis mellifera) from southwestern Asia [Data set]. Zenodo. https://doi.org/10.5281/zenodo.17075125
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofilski, A. (2018). Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49(2), 172-184. https://doi.org/10.1007/s13592-017-0538-y
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofiilski, A. (2018). Dataset: Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49, 172–184. https://doi.org/10.5281/zenodo.7567336
Temirbayeva, K., Torekhanov, A., Nuralieva, U., Sheralieva, Z., & Tofilski, A. (2023). In Search of Apis mellifera pomonella in Kazakhstan. Life, 13(9), 1860. https://doi.org/10.3390/life13091860
Temirbayeva, K., Torekhanov, A., Nuralieva, U., Sheralieva, Z., & Tofilski, A. (2023). Fore wings of honey bees (Apis mellifera) from Kazakhstan [Data set]. Zenodo. https://doi.org/10.5281/zenodo.8128010
Session information
sessionInfo()R version 4.5.0 (2025-04-11 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 22000)
Matrix products: default
LAPACK version 3.12.1
locale:
[1] LC_COLLATE=en_US.UTF-8 LC_CTYPE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[4] LC_NUMERIC=C LC_TIME=en_US.UTF-8
time zone: Europe/Warsaw
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggpubr_0.6.2 sf_1.0-21 raster_3.6-32
[4] sp_2.2-0 rnaturalearth_1.1.0 ggplot2_4.0.0
[7] stringr_1.6.0 IdentiFlyR_0.1.1 reshape2_1.4.4
[10] Morpho_2.13 geomorph_4.0.10 Matrix_1.7-3
[13] rgl_1.3.24 RRPP_2.1.2
loaded via a namespace (and not attached):
[1] tidyselect_1.2.1 dplyr_1.1.4 farver_2.1.2
[4] S7_0.2.0 fastmap_1.2.0 tweenr_2.0.3
[7] digest_0.6.37 lifecycle_1.0.4 survival_3.8-3
[10] terra_1.8-80 bezier_1.1.2 magrittr_2.0.4
[13] compiler_4.5.0 rlang_1.1.6 tools_4.5.0
[16] yaml_2.3.10 knitr_1.50 ggsignif_0.6.4
[19] fitdistrplus_1.2-4 labeling_0.4.3 htmlwidgets_1.6.4
[22] scatterplot3d_0.3-44 curl_7.0.0 mclust_6.1.2
[25] classInt_0.4-11 xml2_1.4.1 plyr_1.8.9
[28] RColorBrewer_1.1-3 shapes_1.2.8 abind_1.4-8
[31] KernSmooth_2.23-26 withr_3.0.2 purrr_1.2.0
[34] polyclip_1.10-7 grid_4.5.0 e1071_1.7-16
[37] scales_1.4.0 iterators_1.0.14 MASS_7.3-65
[40] cli_3.6.5 rmarkdown_2.30 generics_0.1.4
[43] rstudioapi_0.17.1 Rvcg_0.25 ggforce_0.5.0
[46] DBI_1.2.3 ape_5.8-1 proxy_0.4-27
[49] splines_4.5.0 parallel_4.5.0 formatR_1.14
[52] s2_1.1.9 base64enc_0.1-3 vctrs_0.6.5
[55] minpack.lm_1.2-4 jsonlite_2.0.0 carData_3.0-5
[58] car_3.1-3 ggrepel_0.9.6 rstatix_0.7.3
[61] Formula_1.2-5 jpeg_0.1-11 foreach_1.5.2
[64] tidyr_1.3.1 units_1.0-0 colorRamps_2.3.4
[67] glue_1.8.0 codetools_0.2-20 cowplot_1.2.0
[70] stringi_1.8.7 gtable_0.3.6 tibble_3.3.0
[73] pillar_1.11.1 htmltools_0.5.8.1 rnaturalearthdata_1.0.0
[76] R6_2.6.1 wk_0.9.4 doParallel_1.0.17
[79] evaluate_1.0.5 lattice_0.22-6 backports_1.5.0
[82] broom_1.0.10 class_7.3-23 Rcpp_1.1.0
[85] gridExtra_2.3 nlme_3.1-168 xfun_0.54.1
[88] pkgconfig_2.0.3