Analysis of honey bee wing shape in southwestern Asia

https://github.com/DrawWing/Apis-mellifera-Asia

Authors
Affiliations

Julita Machlowska

University of Agriculture in Krakow, Poland

Irfan Kandemir

Ankara University, Türkiye

Ayça Özkan Koca

Maltepe University, Türkiye

Vakhtang Kakhniashvili

Georgian Beekeepers Union, Georgia

Yehya Alattal

King Saud University, Saudi Arabia

Ahmad Alghamdi

King Saud University, Saudi Arabia

Adam Tofilski

University of Agriculture in Krakow, Poland

Published

2025-09-08

Libraries

# calculations
library(geomorph)  # GPA 
library(Morpho)  # CVA
library(reshape2)  # melt
library(IdentiFlyR)  # classification
library(stringr)  #str_split_fixed

# plotting and visualization
library(ggplot2)  # plots
ggplot2::theme_set(theme_light())
library(rnaturalearth)  # maps
library(raster)  # raster
library(sf)  # st_bbox 
library(ggpubr)  # ggarange

Variable names

p <- 19  # number of landmarks
k <- 2  # number of dimensions, in this case 2 for coordinates (x, y)

# create coordinates names used by IdentiFly
xyNames <- c("x1", "y1")
for (i in 2:p) {
  xyNames <- c(xyNames, paste0("x", i))
  xyNames <- c(xyNames, paste0("y", i))
}
xyNames
 [1] "x1"  "y1"  "x2"  "y2"  "x3"  "y3"  "x4"  "y4"  "x5"  "y5"  "x6"  "y6" 
[13] "x7"  "y7"  "x8"  "y8"  "x9"  "y9"  "x10" "y10" "x11" "y11" "x12" "y12"
[25] "x13" "y13" "x14" "y14" "x15" "y15" "x16" "y16" "x17" "y17" "x18" "y18"
[37] "x19" "y19"
# The number of principal components used is 2*p-4 = 34, which is equal to the
# degrees of freedom create principal components names used by prcomp
pcNames <- paste0("PC", 1:(2 * p - 4))
pcNames
 [1] "PC1"  "PC2"  "PC3"  "PC4"  "PC5"  "PC6"  "PC7"  "PC8"  "PC9"  "PC10"
[11] "PC11" "PC12" "PC13" "PC14" "PC15" "PC16" "PC17" "PC18" "PC19" "PC20"
[21] "PC21" "PC22" "PC23" "PC24" "PC25" "PC26" "PC27" "PC28" "PC29" "PC30"
[31] "PC31" "PC32" "PC33" "PC34"
# define which landmarks are connected by lines in wireframe graph
link.x <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 14,
  15, 16, 17)
link.y <- c(2, 3, 4, 5, 6, 19, 6, 10, 12, 8, 8, 14, 19, 9, 10, 15, 11, 12, 16, 13,
  18, 15, 16, 17, 18)
links.apis <- cbind(link.x, link.y)

Read data

Read raw coordinates of 19 landmarks from Zenodo (Machlowska et al., 2025) Data for Kazakhstan refer to previous publication Temirbayeva et al., (2023a) and Temirbayeva et al., (2023b)

wings <- read.csv("https://zenodo.org/records/17075125/files/AZ-raw-coordinates.csv")
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/CY-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/GE-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/IQ-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/IR-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/SA-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/TJ-raw-coordinates.csv"))
wings <- rbind(wings, read.csv("https://zenodo.org/records/17075125/files/TR-raw-coordinates.csv"))

geo.data <- read.csv("https://zenodo.org/records/17075125/files/AZ-data.csv")
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/CY-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/GE-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/IQ-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/IR-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/SA-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/TJ-data.csv"))
geo.data <- rbind(geo.data, read.csv("https://zenodo.org/records/17075125/files/TR-data.csv"))

KZ.geo.data <- read.csv("https://zenodo.org/record/8128010/files/KZ-data.csv", strip.white = TRUE)
colnames(KZ.geo.data)[colnames(KZ.geo.data) == "group"] <- "notes"
KZ.wings <- read.csv("https://zenodo.org/record/8128010/files/KZ-raw-coordinates.csv")
# use only samples declared as 'local bees'
KZ.wings <- subset(KZ.wings, KZ.geo.data$notes == "local bee")
wings <- rbind(wings, KZ.wings)
KZ.geo.data <- subset(KZ.geo.data, notes == "local bee")
geo.data <- rbind(geo.data, KZ.geo.data)

# extract classifiers
wings$sample <- substr(wings$file, 1, 7)
wings$country <- substr(wings$file, 1, 2)
wings$country <- gsub("AZ", "Azerbaijan", wings$country)
wings$country <- gsub("CY", "Cyprus", wings$country)
wings$country <- gsub("GE", "Georgia", wings$country)
wings$country <- gsub("IQ", "Iraq", wings$country)
wings$country <- gsub("IR", "Iran", wings$country)
wings$country <- gsub("KZ", "Kazakhstan", wings$country)
wings$country <- gsub("SA", "Saudi Arabia", wings$country)
wings$country <- gsub("TJ", "Tajikistan", wings$country)
wings$country <- gsub("TR", "Turkey", wings$country)

# Aggregate geographic data by sample
geoDataSample <- aggregate(geo.data[c("latitude", "longitude")], by = list(wings$sample),
  FUN = mean)
geoDataSample <- data.frame(geoDataSample, row.names = 1)  # move column 1 to row names
geoDataSample$country <- substr(rownames(geoDataSample), 1, 2)
geoDataSample$country <- gsub("AZ", "Azerbaijan", geoDataSample$country)
geoDataSample$country <- gsub("CY", "Cyprus", geoDataSample$country)
geoDataSample$country <- gsub("GE", "Georgia", geoDataSample$country)
geoDataSample$country <- gsub("IQ", "Iraq", geoDataSample$country)
geoDataSample$country <- gsub("IR", "Iran", geoDataSample$country)
geoDataSample$country <- gsub("KZ", "Kazakhstan", geoDataSample$country)
geoDataSample$country <- gsub("SA", "Saudi Arabia", geoDataSample$country)
geoDataSample$country <- gsub("TJ", "Tajikistan", geoDataSample$country)
geoDataSample$country <- gsub("TR", "Turkey", geoDataSample$country)

nrow(wings)  # number of wings
[1] 17457
table(wings$country)  # number of wings per country

  Azerbaijan       Cyprus      Georgia         Iran         Iraq   Kazakhstan 
         114          654         3579         3399           91          442 
Saudi Arabia   Tajikistan       Turkey 
        1397           88         7693 
nrow(geoDataSample)  # number of samples
[1] 1557
table(geoDataSample$country)  # number of samples per country

  Azerbaijan       Cyprus      Georgia         Iran         Iraq   Kazakhstan 
          14           73          102          401           11           29 
Saudi Arabia   Tajikistan       Turkey 
         127            5          795 
# number of workers per sample
nColony <- table(wings$sample)
nColony <- data.frame(nColony, row.names = 1)  # move column 1 to row names
ggplot(nColony, aes(x = Freq)) + geom_histogram() + coord_cartesian(xlim = c(0, 40))

summary(nColony$Freq)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    4.0     8.0     9.0    11.2    10.0    40.0 
# number of samples per location
location <- paste(round(geoDataSample$latitude, 1), round(geoDataSample$longitude,
  1))
nLocation <- table(location)
nLocation <- as.data.frame(nLocation)

locationLatLon <- str_split_fixed(nLocation$location, " ", 2)
colnames(locationLatLon) <- c("latitude", "longitude")
locationLatLon <- apply(locationLatLon, 2, as.numeric)
locationLatLon <- as.data.frame(locationLatLon)
nLocation <- cbind(nLocation, locationLatLon)

Maps

# Read elevation data.  In order to download the TIF file uncomment the two
# lines below.
# download.file('https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip',
# 'wc2.1_2.5m_elev.zip'); unzip('wc2.1_2.5m_elev.zip'); Or use your local file.
elev <- raster("D:/WorldClim/wc2.1_2.5m_elev.tif")

elev.color <- colorRampPalette(c("#f7f7f7", "#d9d9d9", "#bdbdbd", "#636363", "#303030"),
  bias = 2)

x.min <- 25
x.max <- 83
y.min <- 15
y.max <- 46
e <- extent(x.min - 5, x.max + 5, y.min - 5, y.max + 5)
elev <- crop(elev, e)
elev.df <- data.frame(rasterToPoints(elev))
colnames(elev.df) <- c("longitude", "latitude", "altitude")

world <- ne_countries(scale = "medium", returnclass = "sf")
# Extract Caspian See outline
coastline <- ne_coastline(scale = "medium", returnclass = "sf")
bbox <- st_bbox(c(xmin = 46, xmax = 56, ymin = 36, ymax = 47))
caspian <- st_crop(coastline, bbox)
caspian$geometry <- sf::st_cast(caspian$geometry, "POLYGON")  # convert to polygon

ggplot(data = world) + geom_raster(data = elev.df, aes(longitude, latitude, fill = altitude)) +
  geom_sf(data = caspian, fill = "white") + scale_fill_gradientn(colours = elev.color(100)) +
  geom_sf(fill = NA) + coord_sf(xlim = c(x.min, x.max), ylim = c(y.min, y.max)) +
  geom_jitter(data = geoDataSample, aes(x = longitude, y = latitude, color = country,
    shape = country), size = 0.7, width = 0.3, height = 0.3) + scale_color_manual(name = "country",
  values = rainbow(9)) + scale_shape_manual(name = "country", values = 1:9) + theme(legend.position = "right")

ggplot(data = world) + geom_raster(data = elev.df, aes(longitude, latitude, fill = altitude)) +
  geom_sf(data = caspian, fill = "white") + scale_fill_gradientn(colours = elev.color(100)) +
  geom_sf(fill = NA) + coord_sf(xlim = c(x.min, x.max), ylim = c(y.min, y.max)) +
  geom_point(data = nLocation, aes(x = longitude, y = latitude, size = Freq)) +
  scale_size_binned_area(name = "colonies", limits = c(0, 100), breaks = c(0, 5,
    10, 20, 40)) + theme(legend.position = "right")

GPA-alignment

# Convert from 2D array to 3D array
wings3D <- arrayspecs(wings[xyNames], p, k)
dimnames(wings3D)[[3]] <- wings$file

# Align the coordinates using Generalized Procrustes Analysis
GPA <- gpagen(wings3D, print.progress = FALSE)

# plot landmarks after alignment
plotAllSpecimens(GPA$coords, links = links.apis, label = TRUE, plot_param = list(pt.bg = "black",
  pt.cex = 0.5, mean.bg = "red", mean.cex = 1, link.col = "red", txt.pos = 3, txt.cex = 1))

# Convert from 3D array to 2D array
wingsAligned <- two.d.array(GPA$coords)
colnames(wingsAligned) <- xyNames

samplesAligned <- aggregate(wingsAligned, by = list(wings$sample), FUN = mean)
samplesAligned <- data.frame(samplesAligned, row.names = 1)  # move column 1 to row names

PCA colony

PCA <- prcomp(samplesAligned[, xyNames])
summary(PCA)
Importance of components:
                          PC1     PC2     PC3     PC4     PC5     PC6    PC7
Standard deviation     0.0103 0.00784 0.00686 0.00503 0.00414 0.00376 0.0035
Proportion of Variance 0.2851 0.16607 0.12706 0.06836 0.04620 0.03810 0.0331
Cumulative Proportion  0.2851 0.45117 0.57823 0.64659 0.69278 0.73089 0.7640
                           PC8     PC9    PC10    PC11   PC12    PC13    PC14
Standard deviation     0.00347 0.00302 0.00286 0.00273 0.0027 0.00228 0.00219
Proportion of Variance 0.03247 0.02457 0.02212 0.02008 0.0197 0.01401 0.01293
Cumulative Proportion  0.79646 0.82103 0.84315 0.86324 0.8829 0.89689 0.90982
                          PC15    PC16    PC17    PC18    PC19    PC20    PC21
Standard deviation     0.00209 0.00201 0.00198 0.00173 0.00159 0.00142 0.00138
Proportion of Variance 0.01180 0.01087 0.01056 0.00811 0.00686 0.00543 0.00511
Cumulative Proportion  0.92162 0.93249 0.94306 0.95116 0.95802 0.96345 0.96856
                          PC22    PC23    PC24    PC25    PC26     PC27
Standard deviation     0.00134 0.00121 0.00111 0.00109 0.00105 0.000918
Proportion of Variance 0.00484 0.00393 0.00333 0.00318 0.00300 0.002280
Cumulative Proportion  0.97341 0.97733 0.98066 0.98384 0.98684 0.989120
                           PC28     PC29     PC30     PC31     PC32     PC33
Standard deviation     0.000906 0.000859 0.000835 0.000795 0.000682 0.000605
Proportion of Variance 0.002220 0.001990 0.001880 0.001710 0.001260 0.000990
Cumulative Proportion  0.991330 0.993320 0.995210 0.996920 0.998170 0.999160
                           PC34     PC35     PC36     PC37     PC38
Standard deviation     0.000558 4.43e-17 2.53e-17 1.88e-17 1.08e-17
Proportion of Variance 0.000840 0.00e+00 0.00e+00 0.00e+00 0.00e+00
Cumulative Proportion  1.000000 1.00e+00 1.00e+00 1.00e+00 1.00e+00
PCscores <- as.data.frame(PCA$x)
PCscores <- cbind(PCscores, geoDataSample)

# create plot labels
variance.tab <- summary(PCA)$importance
variance <- variance.tab["Proportion of Variance", "PC1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "PC2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")

figPca <- ggplot(PCscores, aes(x = PC1, y = PC2, shape = country, color = country)) +
  geom_point(size = 0.7) + scale_shape_manual(name = "country", values = 1:9) +
  scale_color_manual(name = "country", values = rainbow(9)) + stat_ellipse() +
  xlab(label.x) + ylab(label.y)
figPca

MANOVA

MANOVA <- manova(as.matrix(cbind(PCscores[, pcNames])) ~ country, PCscores)
summary(MANOVA)
            Df Pillai approx F num Df den Df Pr(>F)    
country      8   3.29     31.3    272  12176 <2e-16 ***
Residuals 1548                                         
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

LDA

# use equal prior probability for all groups
n.gr <- length(unique(PCscores$country))  # number of groups
LDA <- CVA(PCscores[pcNames], PCscores$country, rounds = 10000, cv = TRUE, prior = rep(1/n.gr,
  n.gr))
LDscores <- as.data.frame(LDA$CVscores)
# rename variable names from CV to LD otherwise use `CV 1`
colnames(LDscores) <- gsub("CV ", "LD", colnames(LDscores))
LDscores$country <- geoDataSample$country

figLda <- ggplot(LDscores, aes(x = LD1, y = LD2, shape = country, color = country)) +
  geom_point(size = 0.7) + scale_shape_manual(name = "country", values = 1:9) +
  scale_color_manual(name = "country", values = rainbow(9)) + stat_ellipse()
figLda

ggarrange(figPca, figLda, labels = c("a", "b"), font.label = list(size = 12, face = "bold"),
  ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom")

# Classification of the samples to groups
LDclass <- typprobClass(LDA$CVscores, groups = as.factor(LDscores$country), outlier = 0)
print(LDclass)
 cross-validated classification results in frequencies
              
               Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan Saudi Arabia
  Azerbaijan           14      0       0    0    0          0            0
  Cyprus                1     68       0    0    2          0            0
  Georgia               0      0     102    0    0          0            0
  Iran                  0      0       0  398    0          2            0
  Iraq                  0      0       0    0   11          0            0
  Kazakhstan            0      0       1    0    0         26            0
  Saudi Arabia          0      0       0    0    0          0          127
  Tajikistan            0      0       0    0    0          0            0
  Turkey                5      8       8    2    5         12            0
              
               Tajikistan Turkey
  Azerbaijan            0      0
  Cyprus                0      2
  Georgia               0      0
  Iran                  1      0
  Iraq                  0      0
  Kazakhstan            2      0
  Saudi Arabia          0      0
  Tajikistan            5      0
  Turkey                1    754


 cross-validated classification result in %
              
               Azerbaijan    Cyprus   Georgia      Iran      Iraq Kazakhstan
  Azerbaijan    100.00000   0.00000   0.00000   0.00000   0.00000    0.00000
  Cyprus          1.36986  93.15068   0.00000   0.00000   2.73973    0.00000
  Georgia         0.00000   0.00000 100.00000   0.00000   0.00000    0.00000
  Iran            0.00000   0.00000   0.00000  99.25187   0.00000    0.49875
  Iraq            0.00000   0.00000   0.00000   0.00000 100.00000    0.00000
  Kazakhstan      0.00000   0.00000   3.44828   0.00000   0.00000   89.65517
  Saudi Arabia    0.00000   0.00000   0.00000   0.00000   0.00000    0.00000
  Tajikistan      0.00000   0.00000   0.00000   0.00000   0.00000    0.00000
  Turkey          0.62893   1.00629   1.00629   0.25157   0.62893    1.50943
              
               Saudi Arabia Tajikistan    Turkey
  Azerbaijan        0.00000    0.00000   0.00000
  Cyprus            0.00000    0.00000   2.73973
  Georgia           0.00000    0.00000   0.00000
  Iran              0.00000    0.24938   0.00000
  Iraq              0.00000    0.00000   0.00000
  Kazakhstan        0.00000    6.89655   0.00000
  Saudi Arabia    100.00000    0.00000   0.00000
  Tajikistan        0.00000  100.00000   0.00000
  Turkey            0.00000    0.12579  94.84277


 overall classification accuracy: 96.66024 %

 Kappa statistic: 0.95027
# Mahalanobis distances between groups
knitr::kable(as.data.frame(as.matrix(LDA$Dist$GroupdistMaha)), digits = 3)
Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan Saudi Arabia Tajikistan Turkey
Azerbaijan 0.00 5.16 7.71 6.67 4.62 7.76 10.60 7.38 4.93
Cyprus 5.16 0.00 7.62 6.53 5.74 7.57 9.94 7.89 4.91
Georgia 7.71 7.62 0.00 6.49 7.30 5.64 9.66 5.37 5.16
Iran 6.67 6.53 6.49 0.00 6.48 5.28 9.38 5.12 6.24
Iraq 4.62 5.74 7.30 6.48 0.00 6.92 11.34 7.06 5.31
Kazakhstan 7.76 7.57 5.64 5.28 6.92 0.00 9.86 4.03 5.65
Saudi Arabia 10.60 9.94 9.66 9.38 11.34 9.86 0.00 9.66 8.91
Tajikistan 7.38 7.89 5.37 5.12 7.06 4.03 9.66 0.00 6.40
Turkey 4.93 4.91 5.16 6.24 5.31 5.65 8.91 6.40 0.00
# Probabilities of differences between groups
knitr::kable(as.data.frame(as.matrix(LDA$Dist$probsMaha)), digits = 5)
Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan Saudi Arabia Tajikistan Turkey
Azerbaijan 0.0000 1e-04 0.0001 0.0001 0.0015 0.0001 1e-04 0.0002 1e-04
Cyprus 0.0001 0e+00 0.0001 0.0001 0.0001 0.0001 1e-04 0.0001 1e-04
Georgia 0.0001 1e-04 0.0000 0.0001 0.0001 0.0001 1e-04 0.0031 1e-04
Iran 0.0001 1e-04 0.0001 0.0000 0.0001 0.0001 1e-04 0.0044 1e-04
Iraq 0.0015 1e-04 0.0001 0.0001 0.0000 0.0001 1e-04 0.0002 1e-04
Kazakhstan 0.0001 1e-04 0.0001 0.0001 0.0001 0.0000 1e-04 0.1117 1e-04
Saudi Arabia 0.0001 1e-04 0.0001 0.0001 0.0001 0.0001 0e+00 0.0001 1e-04
Tajikistan 0.0002 1e-04 0.0031 0.0044 0.0002 0.1117 1e-04 0.0000 5e-04
Turkey 0.0001 1e-04 0.0001 0.0001 0.0001 0.0001 1e-04 0.0005 0e+00

Classification of colonies as lineages

Comparison with lineages dataset Nawrocka et al., (2018a) and Nawrocka et al., (2018b)

idData <- xml2gmLdaData("https://zenodo.org/records/14054009/files/apis-mellifera-lineage.dw.xml")
id <- gmLdaData2id(idData, samplesAligned[, xyNames], average = FALSE)
id$plot

tab <- table(id$id$group)
tab

  A   C   M   O 
383 260  85 829 
prop.table(tab)

     A      C      M      O 
0.2460 0.1670 0.0546 0.5324 
idID <- id$id
idID$country <- substr(rownames(idID), 1, 2)
tab <- table(idID$group, idID$country)
tab
   
     AZ  CY  GE  IQ  IR  KZ  SA  TJ  TR
  A   0   2   7   0 146   6 127   0  95
  C   2  43   0   1 107  11   0   3  93
  M   0   8   0   0  52   0   0   0  25
  O  12  20  95  10  96  12   0   2 582
prop.table(tab, margin = 2)
   
        AZ     CY     GE     IQ     IR     KZ     SA     TJ     TR
  A 0.0000 0.0274 0.0686 0.0000 0.3641 0.2069 1.0000 0.0000 0.1195
  C 0.1429 0.5890 0.0000 0.0909 0.2668 0.3793 0.0000 0.6000 0.1170
  M 0.0000 0.1096 0.0000 0.0000 0.1297 0.0000 0.0000 0.0000 0.0314
  O 0.8571 0.2740 0.9314 0.9091 0.2394 0.4138 0.0000 0.4000 0.7321
idMD <- id$MD2
pTab <- as.data.frame(id$P)
sums <- rowSums(pTab)
pTab <- pTab/sums

pTabLong <- pTab
pTabLong$sample <- rownames(pTabLong)
pTabLong <- melt(pTabLong, id.vars = c("sample"))
pTabLong$gr <- substr(pTabLong$sample, 1, 2)  # short 2 letter name

ggplot(pTabLong, aes(sample, value, fill = variable)) + geom_col(width = 1) + scale_fill_manual(values = rainbow(4),
  name = "") + facet_grid(~gr, switch = "x", scales = "free", space = "free") +
  theme_minimal() + labs(x = NULL, y = NULL) + scale_y_continuous(expand = c(0,
  0)) + guides(fill = guide_legend(nrow = 1, byrow = TRUE)) + theme(panel.spacing.x = unit(0.2,
  "lines"), axis.text.x = element_blank(), legend.position = "top")

geoDataSample$lineage <- idID$group

ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample, aes(x = longitude,
  y = latitude, color = lineage, shape = lineage), width = 0.4, height = 0.4, size = 0.7) +
  coord_sf(xlim = c(x.min, x.max), ylim = c(y.min, y.max)) + scale_color_manual(name = "",
  values = rainbow(4)) + scale_shape_manual(name = "", values = 1:4) + guides(color = guide_legend(nrow = 1,
  byrow = TRUE)) + theme(legend.position = "top")

# map of lineage A
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
  "A", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
  x.max), ylim = c(y.min, y.max))

# map of lineage C
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
  "C", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
  x.max), ylim = c(y.min, y.max))

# map of lineage M
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
  "M", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
  x.max), ylim = c(y.min, y.max))

# map of lineage O
ggplot(data = world) + geom_sf() + geom_jitter(data = geoDataSample[geoDataSample$lineage ==
  "O", ], aes(x = longitude, y = latitude), width = 0.4, height = 0.4) + coord_sf(xlim = c(x.min,
  x.max), ylim = c(y.min, y.max))

Error of measurements

wingsAdj <- read.csv("https://zenodo.org/records/17075125/files/GE-30-3x.csv", row.names = 1)
wingsAdj3D <- arrayspecs(wingsAdj[xyNames], p, k)
dimnames(wingsAdj3D)[[3]] <- rownames(wingsAdj)

# Align the coordinates using Generalized Procrustes Analysis
GPA <- gpagen(wingsAdj3D, print.progress = FALSE)
wingsAdjAligned <- two.d.array(GPA$coords)
colnames(wingsAdjAligned) <- xyNames
wingsAdjAligned <- as.data.frame(wingsAdjAligned)
wingsAdjAligned$colony <- substr(rownames(wingsAdj), 5, 7)
wingsAdjAligned$colony <- as.numeric(wingsAdjAligned$colony)
wingsAdjAligned$replication <- str_sub(rownames(wingsAdj), -8, -8)
wingsAdjAligned$colony_rep <- paste(wingsAdjAligned$colony, wingsAdjAligned$replication,
  sep = "_")

samplesAdj <- aggregate(wingsAdjAligned[xyNames], by = list(wingsAdjAligned$colony_rep),
  FUN = mean)
samplesAdj <- data.frame(samplesAdj, row.names = 1)  # move column 1 to row names

colonyRep <- str_split_fixed(rownames(samplesAdj), "_", 2)
colnames(colonyRep) <- c("colony", "replication")
samplesAdj <- cbind(samplesAdj, colonyRep)
table(samplesAdj$replication)

 0  1  2 
30 30 30 
PCA <- prcomp(samplesAdj[, xyNames])
PCscores <- as.data.frame(PCA$x)

PCscores$colony <- samplesAdj$colony
PCscores$group <- samplesAdj$replication
PCscores$point <- PCscores$group

# create plot labels
variance.tab <- summary(PCA)$importance
variance <- variance.tab["Proportion of Variance", "PC1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "PC2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")

repMean <- aggregate(PCscores[, c("PC1", "PC2")], by = list(PCscores$colony), FUN = mean)
colnames(repMean)[1] <- "colony"
repMean$point <- "mean"
repMean$group <- 0
repMean1 <- repMean
repMean1$group <- 1
repMean2 <- repMean
repMean2$group <- 2

plotData <- rbind(PCscores[, c("PC1", "PC2", "colony", "group")], repMean[, c("PC1",
  "PC2", "colony", "group")], repMean1[, c("PC1", "PC2", "colony", "group")], repMean2[,
  c("PC1", "PC2", "colony", "group")])
plotData$colony_gr <- paste(plotData$colony, plotData$group, sep = "-")
figME1 <- ggplot(plotData, aes(x = PC1, y = PC2, color = group, shape = group)) +
  geom_point() + scale_color_manual(name = "replication", values = c(rainbow(3),
  "black")) + scale_shape_manual(name = "replication", values = c(16, 16, 16)) +
  geom_line(aes(group = colony_gr), color = "black") + xlab(label.x) + ylab(label.y) +
  theme(legend.position = "none")
figME1

# Convert from 2D array to 3D array
samplesAdj3D <- arrayspecs(samplesAdj[xyNames], p, k)
dimnames(samplesAdj3D)[[3]] <- rownames(samplesAdj)
wingsAdjList <- list(coordsarray = samplesAdj3D, colony = samplesAdj$colony, replication = samplesAdj$replication)

ME1 <- gm.measurement.error(coords = "coordsarray", subjects = "colony", replicates = "replication",
  data = wingsAdjList)
anova(ME1)

Analysis of Variance, using Residual Randomization
Permutation procedure: Randomization of null model residuals 
Number of permutations: 1000 
Estimation method: Ordinary Least Squares 
Sums of Squares and Cross-products: Type Within-subject II 
Effect sizes (Z) based on SNR distributions

              Df      SS       MS   Rsq EtaSq.ME   SNR    Z Pr(>SNR)    
Subjects      29 0.01039 0.000358 0.919          266.3 17.9    0.001 ***
Systematic ME  2 0.00087 0.000436 0.077    0.957  22.4 13.8    0.001 ***
Random ME     58 0.00004 0.000001 0.003    0.043                        
Total         89 0.01130                                                
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call: Y ~ subjects + replicates
ICCstats(ME1, subjects = "Subjects", with_in = "Systematic ME")
              Df      SS       MS   Rsq EtaSq.ME   SNR    Z Pr(>SNR)    
Subjects      29 0.01039 0.000358 0.919          266.3 17.9    0.001 ***
Systematic ME  2 0.00087 0.000436 0.077    0.957  22.4 13.8    0.001 ***
Random ME     58 0.00004 0.000001 0.003    0.043                        
Total         89 0.01130                                                
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ICC stats based on dispersion of values

  ICC ICC_a ICC_c 
0.883 0.887 0.994 
plot(ME1)

MEplot <- plot(ME1)
plotPoints <- MEplot$data
plotPoints$x <- MEplot$x
plotPoints$y <- MEplot$y

plotPoints$colony <- plotPoints$subjects
plotPoints$group <- plotPoints$replicates
plotPoints$point <- plotPoints$group

repMean <- aggregate(plotPoints[, c("x", "y")], by = list(plotPoints$colony), FUN = mean)
colnames(repMean)[1] <- "colony"
repMean$point <- "mean"
repMean$group <- 0
repMean1 <- repMean
repMean1$group <- 1
repMean2 <- repMean
repMean2$group <- 2

plotData <- rbind(plotPoints[, c("x", "y", "colony", "group", "point")], repMean[,
  c("x", "y", "colony", "group", "point")], repMean1[, c("x", "y", "colony", "group",
  "point")], repMean2[, c("x", "y", "colony", "group", "point")])
plotData$colony_gr <- paste(plotData$colony, plotData$group, sep = "-")
figME2 <- ggplot(plotData, aes(x = x, y = y, color = point, shape = point)) + geom_point() +
  scale_color_manual(name = "replication", values = c(rainbow(3), "black")) + scale_shape_manual(name = "replication",
  values = c(16, 16, 16, 16)) + geom_line(aes(group = colony_gr), color = "black",
  linewidth = 0.5) + xlab(MEplot$xlab) + ylab(MEplot$ylab) + theme(legend.position = "none")
figME2

ggarrange(figME1, figME2, labels = c("a", "b"), font.label = list(size = 12, face = "bold"),
  ncol = 2, nrow = 1)

Procrustes distances

GE30 <- wings[wings$country == "Georgia", ]
GE30$colony <- substr(GE30$file, 5, 7)
GE30$colony <- as.numeric(GE30$colony)
GE30 <- data.frame(GE30, row.names = 1)  # move column 1 to row names
GE30 <- GE30[GE30$colony < 31, c(xyNames, "colony")]
nrow(GE30)  # number of wings
[1] 1049
# number of workers per colony
nColony <- table(GE30$colony)
nColony <- as.data.frame(nColony)
summary(nColony$Freq)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   30.0    34.2    36.0    35.0    36.0    36.0 
# Procrustes distances between workers in 30 colonies
pdVec <- numeric(30)
for (i in 1:30) {
  coords <- arrayspecs(GE30[GE30$colony == i, xyNames], p, k)
  Y.gpa <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
  D <- Y.gpa$procD
  pdVec[i] <- mean(D)
}
summary(pdVec)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0211  0.0220  0.0228  0.0231  0.0239  0.0283 
# Procrustes distance between colonies
GE30colony <- aggregate(GE30[, xyNames], by = list(GE30$colony), FUN = mean)
GE30colony <- data.frame(GE30colony, row.names = 1)  # move column 1 to row names
coords <- arrayspecs(GE30colony, p, k)
Y.gpa <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
# Pairwise Procrustes distances among specimens
D <- Y.gpa$procD
summary(D)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00696 0.01153 0.01379 0.01411 0.01617 0.02600 
# Procrustes distance between replications
coords <- aggregate(wingsAdjAligned[, xyNames], by = list(wingsAdjAligned$replication),
  FUN = mean)
coords <- data.frame(coords, row.names = 1)  # move column 1 to row names
coords <- arrayspecs(coords, p, k)
Y.gpa <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
D <- Y.gpa$procD
summary(D)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00420 0.00491 0.00562 0.00532 0.00588 0.00614 
# Procrustes distance between countries
coords <- aggregate(wingsAligned, by = list(wings$country), FUN = mean)
coords <- data.frame(coords, row.names = 1)  # move column 1 to row names
coords <- arrayspecs(coords, p, k)
Y.gpa <- gpagen(coords, Proj = TRUE, verbose = TRUE, print.progress = FALSE)
D <- Y.gpa$procD
knitr::kable(as.data.frame(as.matrix(D)), digits = 4)
Azerbaijan Cyprus Georgia Iran Iraq Kazakhstan Saudi Arabia Tajikistan Turkey
Azerbaijan 0.0000 0.0154 0.0195 0.0161 0.0108 0.0174 0.0291 0.0149 0.0145
Cyprus 0.0154 0.0000 0.0182 0.0133 0.0170 0.0173 0.0235 0.0180 0.0132
Georgia 0.0195 0.0182 0.0000 0.0205 0.0189 0.0117 0.0229 0.0155 0.0118
Iran 0.0161 0.0133 0.0205 0.0000 0.0164 0.0151 0.0277 0.0128 0.0185
Iraq 0.0108 0.0170 0.0189 0.0164 0.0000 0.0157 0.0321 0.0128 0.0162
Kazakhstan 0.0174 0.0173 0.0117 0.0151 0.0157 0.0000 0.0236 0.0095 0.0127
Saudi Arabia 0.0291 0.0235 0.0229 0.0277 0.0321 0.0236 0.0000 0.0284 0.0215
Tajikistan 0.0149 0.0180 0.0155 0.0128 0.0128 0.0095 0.0284 0.0000 0.0163
Turkey 0.0145 0.0132 0.0118 0.0185 0.0162 0.0127 0.0215 0.0163 0.0000
# D
summary(D)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00954 0.01419 0.01635 0.01775 0.01976 0.03213 

Influence of sample size on classification results

# single colony
colony <- wingsAligned[geo.data$sample == "GE-0001", xyNames]

# for each sample size create 36 replicates equal to number of workers in the
# colony
nrep <- nrow(colony)
nVec <- 2^(0:5)  # vector with sample size ranging from 1 to 32
plotList <- vector(mode = "list", length = length(nVec))
classDf <- data.frame(matrix(ncol = length(nVec), nrow = nrep))
OpDf <- data.frame(matrix(ncol = length(nVec), nrow = nrep))
set.seed(1)
for (i in 1:length(nVec)) {
  n <- nVec[i]
  replicates <- replicate(nrep, colMeans(colony[sample(1:nrow(colony), n, replace = FALSE),
    , drop = FALSE]))
  replicates <- t(replicates)

  id <- gmLdaData2id(idData, replicates, average = FALSE)
  plotList[[i]] <- id$plot  # add plot to list
  classDf[, i] <- id$id$group
  OpDf[, i] <- id$P[, 4]
}

plotLabels <- letters[1:6]
plotLabels <- paste(plotLabels, nVec, sep = ",  n=")
ggarrange(plotlist = plotList, labels = plotLabels, font.label = list(size = 12,
  face = "bold"), ncol = 2, nrow = 3)

classDf
   X1 X2 X3 X4 X5 X6
1   O  O  O  O  O  O
2   O  O  O  O  O  O
3   O  A  O  O  O  O
4   A  O  O  O  O  O
5   O  O  O  O  O  O
6   O  O  O  O  O  O
7   O  A  O  O  O  O
8   O  O  O  O  O  O
9   O  O  O  O  O  O
10  O  O  O  O  O  O
11  O  O  O  O  O  O
12  O  O  O  O  O  O
13  O  O  O  O  O  O
14  O  O  O  O  O  O
15  O  O  O  O  O  O
16  O  O  O  O  O  O
17  O  O  O  O  O  O
18  O  O  O  O  O  O
19  O  O  O  O  O  O
20  O  O  O  O  O  O
21  O  O  O  O  O  O
22  O  O  O  O  O  O
23  O  O  O  O  O  O
24  O  O  O  O  O  O
25  O  O  O  O  O  O
26  O  O  O  O  O  O
27  A  O  O  O  O  O
28  O  O  O  O  O  O
29  O  O  O  O  O  O
30  O  O  O  O  O  O
31  O  O  O  O  O  O
32  O  O  O  O  O  O
33  O  O  O  O  O  O
34  O  O  O  O  O  O
35  O  O  O  O  O  O
36  O  O  O  O  O  O

References

Machlowska, J., Kandemir, I., Özkan Koca, A., Kakhniashvili, V., Alattal, Y., Alghamdi, A., & Tofilski, A. (2025). Fore wings of honey bees (Apis mellifera) from southwestern Asia [Data set]. Zenodo. https://doi.org/10.5281/zenodo.17075125

Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofilski, A. (2018). Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49(2), 172-184. https://doi.org/10.1007/s13592-017-0538-y

Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofiilski, A. (2018). Dataset: Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49, 172–184. https://doi.org/10.5281/zenodo.7567336

Temirbayeva, K., Torekhanov, A., Nuralieva, U., Sheralieva, Z., & Tofilski, A. (2023). In Search of Apis mellifera pomonella in Kazakhstan. Life, 13(9), 1860. https://doi.org/10.3390/life13091860

Temirbayeva, K., Torekhanov, A., Nuralieva, U., Sheralieva, Z., & Tofilski, A. (2023). Fore wings of honey bees (Apis mellifera) from Kazakhstan [Data set]. Zenodo. https://doi.org/10.5281/zenodo.8128010

Session information

sessionInfo()
R version 4.5.0 (2025-04-11 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 22000)

Matrix products: default
  LAPACK version 3.12.1

locale:
[1] LC_COLLATE=Polish_Poland.utf8  LC_CTYPE=Polish_Poland.utf8   
[3] LC_MONETARY=Polish_Poland.utf8 LC_NUMERIC=C                  
[5] LC_TIME=Polish_Poland.utf8    

time zone: Europe/Warsaw
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggpubr_0.6.0        sf_1.0-21           raster_3.6-32      
 [4] sp_2.2-0            rnaturalearth_1.0.1 ggplot2_3.5.2      
 [7] stringr_1.5.1       IdentiFlyR_0.1.1    reshape2_1.4.4     
[10] Morpho_2.12         geomorph_4.0.10     Matrix_1.7-3       
[13] rgl_1.3.18          RRPP_2.1.2         

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1        dplyr_1.1.4             farver_2.1.2           
 [4] fastmap_1.2.0           tweenr_2.0.3            digest_0.6.37          
 [7] lifecycle_1.0.4         terra_1.8-50            bezier_1.1.2           
[10] magrittr_2.0.3          compiler_4.5.0          rlang_1.1.6            
[13] tools_4.5.0             yaml_2.3.10             knitr_1.50             
[16] ggsignif_0.6.4          labeling_0.4.3          htmlwidgets_1.6.4      
[19] scatterplot3d_0.3-44    classInt_0.4-11         curl_6.2.2             
[22] plyr_1.8.9              xml2_1.3.8              RColorBrewer_1.1-3     
[25] shapes_1.2.7            abind_1.4-8             KernSmooth_2.23-26     
[28] withr_3.0.2             purrr_1.0.4             polyclip_1.10-7        
[31] grid_4.5.0              e1071_1.7-16            scales_1.4.0           
[34] iterators_1.0.14        MASS_7.3-65             cli_3.6.5              
[37] rmarkdown_2.29          generics_0.1.4          rstudioapi_0.17.1      
[40] httr_1.4.7              Rvcg_0.25               ggforce_0.4.2          
[43] DBI_1.2.3               ape_5.8-1               proxy_0.4-27           
[46] parallel_4.5.0          formatR_1.14            s2_1.1.8               
[49] base64enc_0.1-3         vctrs_0.6.5             minpack.lm_1.2-4       
[52] jsonlite_2.0.0          carData_3.0-5           car_3.1-3              
[55] ggrepel_0.9.6           rstatix_0.7.2           Formula_1.2-5          
[58] jpeg_0.1-11             foreach_1.5.2           tidyr_1.3.1            
[61] units_0.8-7             colorRamps_2.3.4        glue_1.8.0             
[64] codetools_0.2-20        cowplot_1.1.3           stringi_1.8.7          
[67] gtable_0.3.6            tibble_3.2.1            pillar_1.10.2          
[70] htmltools_0.5.8.1       rnaturalearthdata_1.0.0 R6_2.6.1               
[73] wk_0.9.4                doParallel_1.0.17       evaluate_1.0.3         
[76] lattice_0.22-6          backports_1.5.0         broom_1.0.8            
[79] class_7.3-23            Rcpp_1.0.14             gridExtra_2.3          
[82] nlme_3.1-168            xfun_0.52               pkgconfig_2.0.3