Analysis of honey bee wing shape in northwestern Europe

https://github.com/DrawWing/Apis-mellifera-NW-Europe

Authors
Affiliations

Julita Machlowska

University of Agriculture in Krakow, Poland

Dariusz Gerula

National Institute of Horticulture Research, Poland

Andrzej Oleksa

Kazimierz Wielki University in Bydgoszcz, Poland

Henk Kok

Nederlandse Bijenhoudersvereniging, Netherlands

Grace P. McCormack

University of Galway, Ireland

Alexandra Valentine

University of Galway, Ireland

Lars Kirkerud

Norsk Brunbielag, Norway

Per Kryger

Aarhus University, Denmark

Laima Blažytė-Čereškienė

Nature Research Centre, Lithuania

Martin Hasselmann

University of Hohenheim, Germany

Teweldemedhn Gebretinsae Hailu

University of Hohenheim, Germany

Benjamin Rutschmann

University of Würzburg, Germany

Adam Tofilski

University of Agriculture in Krakow, Poland

Modified

2026-04-08

Libraries

# calculations
library(geomorph)  # GPA 
library(Morpho)  # LDA
library(stats)  # MANOVA
library(IdentiFlyR)  # classification
library(reshape2)  # melt
library(stringr)  # str_split_fixed
library(emmeans)  # emmeans

# plotting and visualization
library(ggplot2)  # plots
ggplot2::theme_set(theme_light())
library(rnaturalearth)  # maps
library(terra)  # raster maps
library(sf)  # st_transform
library(ggpubr)  # ggarange
library(ggforce)  # geom_arc_bar

Variable names

p <- 19  # number of landmarks
k <- 2  # number of dimensions, in this case 2 for coordinates (x, y)

# create coordinates names used by IdentiFly
nameXy <- c("x1", "y1")
for (i in 2:p) {
  nameXy <- c(nameXy, paste0("x", i))
  nameXy <- c(nameXy, paste0("y", i))
}
nameXy
 [1] "x1"  "y1"  "x2"  "y2"  "x3"  "y3"  "x4"  "y4"  "x5"  "y5"  "x6"  "y6" 
[13] "x7"  "y7"  "x8"  "y8"  "x9"  "y9"  "x10" "y10" "x11" "y11" "x12" "y12"
[25] "x13" "y13" "x14" "y14" "x15" "y15" "x16" "y16" "x17" "y17" "x18" "y18"
[37] "x19" "y19"
# The number of principal components used is 2*p-4 = 34, which is equal to the
# degrees of freedom
namePc <- paste0("PC", 1:(2 * p - 4))
namePc
 [1] "PC1"  "PC2"  "PC3"  "PC4"  "PC5"  "PC6"  "PC7"  "PC8"  "PC9"  "PC10"
[11] "PC11" "PC12" "PC13" "PC14" "PC15" "PC16" "PC17" "PC18" "PC19" "PC20"
[21] "PC21" "PC22" "PC23" "PC24" "PC25" "PC26" "PC27" "PC28" "PC29" "PC30"
[31] "PC31" "PC32" "PC33" "PC34"
# define which landmarks are connected by lines in wireframe graph
linkX <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 14,
  15, 16, 17)
linkY <- c(2, 3, 4, 5, 6, 19, 6, 10, 12, 8, 8, 14, 19, 9, 10, 15, 11, 12, 16, 13,
  18, 15, 16, 17, 18)
linkApis <- cbind(linkX, linkY)

Read data

New data collected for this study are available at Zenodo (Machlowska et al., 2026). There are use also data published during earlier studies (Oleksa et al., 2022; Tofilski, 2025)

dataRaw <- read.csv("https://zenodo.org/records/18845767/files/BY-raw-coordinates.csv")
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/18845767/files/DE-raw-coordinates.csv"))
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/18845767/files/ES_517_573-raw-coordinates.csv"))
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/18845767/files/FR-raw-coordinates.csv"))
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/18845767/files/GB-raw-coordinates.csv"))
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/18845767/files/IE-raw-coordinates.csv"))
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/18845767/files/LT-raw-coordinates.csv"))
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/18845767/files/NL-raw-coordinates.csv"))
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/18845767/files/NO-raw-coordinates.csv"))
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/18845767/files/PL_254_922-raw-coordinates.csv"))

dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/7244070/files/ES-raw-coordinates.csv"))
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/7244070/files/PL-raw-coordinates.csv"))
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/7244070/files/PT-raw-coordinates.csv"))
dataRaw <- rbind(dataRaw, read.csv("https://zenodo.org/records/17621032/files/CU-raw-coordinates.csv"))

# Read geographic coordinates
dataGeo <- read.csv("https://zenodo.org/records/18845767/files/BY-data.csv")
dataGeo <- rbind(dataGeo, read.csv("https://zenodo.org/records/18845767/files/DE-data.csv"))
dataGeo <- rbind(dataGeo, read.csv("https://zenodo.org/records/18845767/files/ES_517_573-data.csv"))
dataGeo <- rbind(dataGeo, read.csv("https://zenodo.org/records/18845767/files/FR-data.csv"))
dataGeo <- rbind(dataGeo, read.csv("https://zenodo.org/records/18845767/files/GB-data.csv"))
dataGeo <- rbind(dataGeo, read.csv("https://zenodo.org/records/18845767/files/IE-data.csv"))
dataGeo <- rbind(dataGeo, read.csv("https://zenodo.org/records/18845767/files/LT-data.csv"))
dataGeo <- rbind(dataGeo, read.csv("https://zenodo.org/records/18845767/files/NL-data.csv"))
dataGeo <- rbind(dataGeo, read.csv("https://zenodo.org/records/18845767/files/NO-data.csv"))
dataGeo <- rbind(dataGeo, read.csv("https://zenodo.org/records/18845767/files/PL_254_922-data.csv"))
geoDataNew <- dataGeo

# number of workers per sample in new datasets
nColony <- table(geoDataNew$sample)
nColony <- data.frame(nColony, row.names = 1)  # move column 1 to row names
ggplot(nColony, aes(x = Freq)) + geom_histogram(binwidth = 1)

summary(nColony$Freq)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5.00   12.00   21.00   21.64   28.00   69.00 
dataEU <- read.csv("https://zenodo.org/record/7244070/files/EU-geo-data.csv")
countryEU <- substr(dataEU$sample, 1, 2)
dataEU <- dataEU[countryEU %in% c("ES", "PL", "PT"), ]
dataEU$date <- as.numeric(substr(dataEU$date, 1, 4))
dataGeo <- rbind(dataGeo, dataEU)
dataGeo <- rbind(dataGeo, read.csv("https://zenodo.org/records/17621032/files/CU-data.csv"))

dataGeoSample <- aggregate(dataGeo[c("latitude", "longitude", "date")], by = list(dataGeo$sample),
  FUN = mean)
dataGeoSample <- data.frame(dataGeoSample, row.names = 1)  # move column 1 to row names

dataRaw$ISO <- substr(dataRaw$file, 1, 2)
dataGeoSample$ISO <- substr(rownames(dataGeoSample), 1, 2)
dataGeoSample$ISO <- as.factor(dataGeoSample$ISO)
dataGeoSample$country <- dataGeoSample$ISO
dataGeoSample$country <- gsub("BY", "Belarus", dataGeoSample$country)
dataGeoSample$country <- gsub("CU", "Cuba", dataGeoSample$country)
dataGeoSample$country <- gsub("DE", "Germany", dataGeoSample$country)
dataGeoSample$country <- gsub("ES", "Spain", dataGeoSample$country)
dataGeoSample$country <- gsub("FR", "France", dataGeoSample$country)
dataGeoSample$country <- gsub("GB", "UK", dataGeoSample$country)
dataGeoSample$country <- gsub("IE", "Ireland", dataGeoSample$country)
dataGeoSample$country <- gsub("LT", "Lithuania", dataGeoSample$country)
dataGeoSample$country <- gsub("NL", "Netherlands", dataGeoSample$country)
dataGeoSample$country <- gsub("NO", "Norway", dataGeoSample$country)
dataGeoSample$country <- gsub("PL", "Poland", dataGeoSample$country)
dataGeoSample$country <- gsub("PT", "Portugal", dataGeoSample$country)

# number of samples per location
location <- paste(round(dataGeoSample$latitude, 1), round(dataGeoSample$longitude,
  1))
nLocation <- table(location)
nLocation <- as.data.frame(nLocation)

locationLatLon <- str_split_fixed(nLocation$location, " ", 2)
colnames(locationLatLon) <- c("latitude", "longitude")
locationLatLon <- apply(locationLatLon, 2, as.numeric)
locationLatLon <- as.data.frame(locationLatLon)
nLocation <- cbind(nLocation, locationLatLon)


location <- paste(round(dataGeoSample$latitude, 1), round(dataGeoSample$longitude,
  1))
nLocation <- table(location)
nLocation <- as.data.frame(nLocation)

locationLatLon <- str_split_fixed(nLocation$location, " ", 2)
colnames(locationLatLon) <- c("latitude", "longitude")
locationLatLon <- apply(locationLatLon, 2, as.numeric)
locationLatLon <- as.data.frame(locationLatLon)
nLocation <- cbind(nLocation, locationLatLon)

# average data for country
dataGeoCountry <- aggregate(dataGeoSample[c("latitude", "longitude")], by = list(dataGeoSample$ISO),
  FUN = mean)
country <- aggregate(dataGeoSample[c("country")], by = list(dataGeoSample$ISO), FUN = unique)
dataGeoCountry <- merge(dataGeoCountry, country, by = "Group.1")

colnames(dataGeoCountry)[1] <- "ISO"

Map

# Read elevation data In order to download the TIF file uncomment the two lines
# below
# #download.file('https://secure-web.cisco.com/15MbNLDNcvKRdwHz0C8BuWIBGjqhNz-VGOCGhfT0ba5LztPXbyrwb6epoX-FTjajgD8ZFDd0GJRkhcWz3V-WwLn7u53iy9KDz55qixlb5sECZRIKC-2tSg0tc_ZY2hq9U0S10DGheC9vVDLKudF18BT9aItv7zRwYlAnC_SQ2ODKtG9EhyJxY4GV6kntF7KVOTyBn0eT9PmiW_EJvCBFchY50GZBD6Ul9wyCVarVhNMTz91dmugWBBEpPb0St3lArbpanzUMmb4xrnZtX1Ve4AW4V9Px-yU6hq3_xT-TnHlbpX-oILMRTq8lJlFRDLUMMzJdYdy_64T_9KQCiP1M9qTPrJjl3rcIo5lZJoHawqz1DUYYKxe-u4oyQK77hLQL4OaxSZXWylQf-rly0woYPMDTW0YixWICqzpMBof0YhX8/https%3A%2F%2Fgeodata.ucdavis.edu%2Fclimate%2Fworldclim%2F2_1%2Fbase%2Fwc2.1_2.5m_elev.zip',
# 'wc2.1_2.5m_elev.zip') unzip('wc2.1_2.5m_elev.zip')

# or use your local file
elev <- rast("D:/WorldClim/wc2.1_2.5m_elev.tif")

colorElev <- colorRampPalette(c("#f7f7f7", "#d9d9d9", "#bdbdbd", "#636363", "#303030"),
  bias = 2)

xMin <- -10
xMax <- 40
yMin <- 35
yMax <- 65

# Define extent and crop (terra)
e <- ext(xMin - 5, xMax + 5, yMin - 5, yMax + 5)
elev <- crop(elev, e)
elevDf <- as.data.frame(elev, xy = TRUE, na.rm = TRUE)
colnames(elevDf) <- c("longitude", "latitude", "altitude")

dataMap <- ne_countries(scale = "medium", returnclass = "sf")

n <- length(unique(dataGeoSample$country))

ggplot(data = dataMap) + geom_sf() + coord_sf(xlim = c(xMin, xMax), ylim = c(yMin,
  yMax)) + geom_jitter(data = dataGeoSample, aes(x = longitude, y = latitude, color = country,
  shape = country), width = 0.5, height = 0.5) + scale_shape_manual(name = "",
  values = 1:n) + scale_color_manual(name = "", values = rainbow(n))

# Figure 1
ggplot(data = dataMap) + geom_raster(data = elevDf, aes(x = longitude, y = latitude,
  fill = altitude)) + scale_fill_gradientn(colours = colorElev(100)) + geom_sf(fill = NA) +
  coord_sf(xlim = c(xMin, xMax), ylim = c(yMin, yMax)) + geom_point(data = nLocation,
  aes(x = longitude, y = latitude, size = Freq)) + scale_size_binned_area(name = "colonies",
  limits = c(0, 100), breaks = c(0, 5, 10, 20, 40)) + theme(legend.position = "right")

# only new samples
geoDataNewSample <- aggregate(geoDataNew[c("latitude", "longitude", "date")], by = list(geoDataNew$sample),
  FUN = mean)

geoDataNewSample <- data.frame(geoDataNewSample, row.names = 1)
geoDataNewSample$ISO <- substr(rownames(geoDataNewSample), 1, 2)

ggplot(data = dataMap) + geom_sf() + coord_sf(xlim = c(xMin, xMax), ylim = c(yMin,
  yMax)) + geom_jitter(data = geoDataNewSample, aes(x = longitude, y = latitude,
  color = ISO, shape = ISO), width = 0.5, height = 0.5) + scale_shape_manual(name = "",
  values = 1:n) + scale_color_manual(name = "", values = rainbow(n))

GPA alignment

# Convert from 2D array to 3D array
wing3D <- arrayspecs(dataRaw[, nameXy], p, k)
dimnames(wing3D)[[3]] <- dataRaw$file

# Align the coordinates using Generalized Procrustes Analysis
GPA <- gpagen(wing3D, print.progress = FALSE)
# Convert from 3D array to 2D array
dataAligned <- two.d.array(GPA$coords)
# rename column names from 1.X, 1.Y to x1, y1 ...
colnames(dataAligned) <- gsub("([0-9]+)\\.([XY])", "\\L\\2\\1", colnames(dataAligned),
  perl = TRUE)

# Average coorinates within samples
dataRaw$sample <- substr(dataRaw$file, 1, 7)
dataSample <- aggregate(dataAligned, by = list(dataRaw$sample), FUN = mean)
dataSample <- data.frame(dataSample, row.names = 1)  # move column 1 to row names
dataSample <- merge(dataSample, dataGeoSample, by = 0)
dataSample <- data.frame(dataSample, row.names = 1)  # move column 1 to row names

Sample size

nrow(geoDataNew)  # number of new wings
[1] 29043
geoDataNew$ISO <- substr(geoDataNew$file, 1, 2)
table(geoDataNew$ISO)  # number of new wings per country

   BY    DE    ES    FR    GB    IE    LT    NL    NO    PL 
  171    60   919   164   425  3498  1358  3555  2274 16619 
nrow(geoDataNewSample)  # number of new samples
[1] 1342
table(geoDataNewSample$ISO)  # number of new samples per country

 BY  DE  ES  FR  GB  IE  LT  NL  NO  PL 
  5   4  57  11  29 304  76 125  84 647 
nrow(dataRaw)  # number of all wings
[1] 38970
table(dataRaw$ISO)  # number of all wings per country

   BY    CU    DE    ES    FR    GB    IE    LT    NL    NO    PL    PT 
  171   449    60  3482   164   425  3498  1358  3555  2274 22574   960 
nrow(dataGeoSample)  # number of all samples
[1] 2312
table(dataGeoSample$ISO)  # number of all samples per country

 BY  CU  DE  ES  FR  GB  IE  LT  NL  NO  PL  PT 
  5   9   4 573  11  29 304  76 125  84 900 192 

PCA

PCA <- prcomp(dataSample[, nameXy], scale. = TRUE)
PCscores <- as.data.frame(PCA$x[, namePc])
PCscores$country <- as.factor(dataSample$country)

# create plot labels
variance.tab <- summary(PCA)$importance
variance <- variance.tab["Proportion of Variance", "PC1"]
variance <- round(100 * variance, 1)
xLabel <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "PC2"]
variance <- round(100 * variance, 1)
yLabel <- paste0("PC2 (", variance, "%)")

n <- length(unique(PCscores$country))
figPca <- ggplot(PCscores, aes(x = PC1, y = PC2, shape = country, color = country)) +
  geom_point() + scale_shape_manual(name = "", values = c(1:n)) + scale_color_manual(name = "",
  values = rainbow(n)) + stat_ellipse() + xlab(xLabel) + ylab(yLabel)
figPca

LDA

# use equal prior probability for all groups
n.gr <- length(unique(PCscores$country))  # number of groups
lda <- CVA(PCscores[namePc], PCscores$country, rounds = 10000, cv = TRUE, prior = rep(1/n.gr,
  n.gr))
ldaScores <- as.data.frame(lda$CVscores)
colnames(ldaScores) <- gsub("CV ", "LD", colnames(ldaScores))
# ldaScores <- cbind(sample.dataGeo, ldaScores)
ldaScores$country <- PCscores$country
ldaScores$ISO <- dataGeoSample$ISO

figLda <- ggplot(ldaScores, aes(x = LD1, y = LD2, shape = country, color = country)) +
  geom_point() + scale_shape_manual(name = "", values = c(1:n)) + scale_color_manual(name = "",
  values = rainbow(n)) + stat_ellipse()
figLda

ggplot(ldaScores, aes(x = LD1, y = LD3, shape = country, color = country)) + geom_point() +
  scale_shape_manual(name = "", values = c(1:n)) + scale_color_manual(name = "",
  values = rainbow(n)) + stat_ellipse()

# Figure 3
ggarrange(figPca, figLda, labels = c("a", "b"), font.label = list(size = 12, face = "bold"),
  ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom")

# Confusion matrix Classification of the dataSample to groups
ldaclass <- typprobClass(lda$CVscores, groups = ldaScores$ISO, outlier = 0)
print(ldaclass)
 cross-validated classification results in frequencies
    
      BY  CU  DE  ES  FR  GB  IE  LT  NL  NO  PL  PT
  BY   5   0   0   0   0   0   0   0   0   0   0   0
  CU   0   9   0   0   0   0   0   0   0   0   0   0
  DE   0   0   4   0   0   0   0   0   0   0   0   0
  ES   3   0   0 440   0   0   2   0   0   1   1 126
  FR   2   0   0   0   9   0   0   0   0   0   0   0
  GB   0   0   0   0   0  29   0   0   0   0   0   0
  IE   0   0   0   0   2   0 283  18   0   0   1   0
  LT   0   0   0   0   2   0   6  62   0   0   6   0
  NL   0   0   0   0   0   0   0   0 122   0   3   0
  NO   0   0   0   0   0   0   0   0   0  84   0   0
  PL  17   1   2   0   1   0   0   0   1  10 867   1
  PT   0   0   0  47   0   0   0   0   0   0   0 145


 cross-validated classification result in %
    
            BY        CU        DE        ES        FR        GB        IE
  BY 100.00000   0.00000   0.00000   0.00000   0.00000   0.00000   0.00000
  CU   0.00000 100.00000   0.00000   0.00000   0.00000   0.00000   0.00000
  DE   0.00000   0.00000 100.00000   0.00000   0.00000   0.00000   0.00000
  ES   0.52356   0.00000   0.00000  76.78883   0.00000   0.00000   0.34904
  FR  18.18182   0.00000   0.00000   0.00000  81.81818   0.00000   0.00000
  GB   0.00000   0.00000   0.00000   0.00000   0.00000 100.00000   0.00000
  IE   0.00000   0.00000   0.00000   0.00000   0.65789   0.00000  93.09211
  LT   0.00000   0.00000   0.00000   0.00000   2.63158   0.00000   7.89474
  NL   0.00000   0.00000   0.00000   0.00000   0.00000   0.00000   0.00000
  NO   0.00000   0.00000   0.00000   0.00000   0.00000   0.00000   0.00000
  PL   1.88889   0.11111   0.22222   0.00000   0.11111   0.00000   0.00000
  PT   0.00000   0.00000   0.00000  24.47917   0.00000   0.00000   0.00000
    
            LT        NL        NO        PL        PT
  BY   0.00000   0.00000   0.00000   0.00000   0.00000
  CU   0.00000   0.00000   0.00000   0.00000   0.00000
  DE   0.00000   0.00000   0.00000   0.00000   0.00000
  ES   0.00000   0.00000   0.17452   0.17452  21.98953
  FR   0.00000   0.00000   0.00000   0.00000   0.00000
  GB   0.00000   0.00000   0.00000   0.00000   0.00000
  IE   5.92105   0.00000   0.00000   0.32895   0.00000
  LT  81.57895   0.00000   0.00000   7.89474   0.00000
  NL   0.00000  97.60000   0.00000   2.40000   0.00000
  NO   0.00000   0.00000 100.00000   0.00000   0.00000
  PL   0.00000   0.11111   1.11111  96.33333   0.11111
  PT   0.00000   0.00000   0.00000   0.00000  75.52083


 overall classification accuracy: 89.05709 %

 Kappa statistic: 0.8575
# Mahalanobis distances between groups
knitr::kable(as.data.frame(as.matrix(lda$Dist$GroupdistMaha)), digits = 3)
Belarus Cuba France Germany Ireland Lithuania Netherlands Norway Poland Portugal Spain UK
Belarus 0.000 6.678 6.397 6.699 6.473 7.036 6.756 5.493 4.738 7.210 6.836 8.376
Cuba 6.678 0.000 9.098 9.360 7.963 9.459 8.179 7.411 6.531 8.915 8.715 10.034
France 6.397 9.098 0.000 5.514 7.008 6.871 8.205 6.244 6.100 8.629 7.939 9.504
Germany 6.699 9.360 5.514 0.000 7.987 7.720 9.283 8.115 5.742 9.560 9.208 9.815
Ireland 6.473 7.963 7.008 7.987 0.000 3.778 8.752 6.579 5.137 7.205 6.616 8.580
Lithuania 7.036 9.459 6.871 7.720 3.778 0.000 9.361 7.585 6.338 8.249 7.679 9.190
Netherlands 6.756 8.179 8.205 9.283 8.752 9.361 0.000 6.684 6.553 8.067 8.113 9.289
Norway 5.493 7.411 6.244 8.115 6.579 7.585 6.684 0.000 5.049 7.199 6.908 7.212
Poland 4.738 6.531 6.100 5.742 5.137 6.338 6.553 5.049 0.000 6.447 6.332 7.289
Portugal 7.210 8.915 8.629 9.560 7.205 8.249 8.067 7.199 6.447 0.000 1.676 9.079
Spain 6.836 8.715 7.939 9.208 6.616 7.679 8.113 6.908 6.332 1.676 0.000 8.984
UK 8.376 10.034 9.504 9.815 8.580 9.190 9.289 7.212 7.289 9.079 8.984 0.000
# pairwise probabilities of differences between groups
knitr::kable(as.data.frame(as.matrix(lda$Dist$probsMaha)), digits = 5)
Belarus Cuba France Germany Ireland Lithuania Netherlands Norway Poland Portugal Spain UK
Belarus 0.0000 6e-04 0.0005 0.0134 1e-04 1e-04 1e-04 7e-04 0.0053 1e-04 1e-04 1e-04
Cuba 0.0006 0e+00 0.0001 0.0001 1e-04 1e-04 1e-04 1e-04 0.0001 1e-04 1e-04 1e-04
France 0.0005 1e-04 0.0000 0.0302 1e-04 1e-04 1e-04 1e-04 0.0001 1e-04 1e-04 1e-04
Germany 0.0134 1e-04 0.0302 0.0000 1e-04 1e-04 1e-04 1e-04 0.0014 1e-04 1e-04 1e-04
Ireland 0.0001 1e-04 0.0001 0.0001 0e+00 1e-04 1e-04 1e-04 0.0001 1e-04 1e-04 1e-04
Lithuania 0.0001 1e-04 0.0001 0.0001 1e-04 0e+00 1e-04 1e-04 0.0001 1e-04 1e-04 1e-04
Netherlands 0.0001 1e-04 0.0001 0.0001 1e-04 1e-04 0e+00 1e-04 0.0001 1e-04 1e-04 1e-04
Norway 0.0007 1e-04 0.0001 0.0001 1e-04 1e-04 1e-04 0e+00 0.0001 1e-04 1e-04 1e-04
Poland 0.0053 1e-04 0.0001 0.0014 1e-04 1e-04 1e-04 1e-04 0.0000 1e-04 1e-04 1e-04
Portugal 0.0001 1e-04 0.0001 0.0001 1e-04 1e-04 1e-04 1e-04 0.0001 0e+00 1e-04 1e-04
Spain 0.0001 1e-04 0.0001 0.0001 1e-04 1e-04 1e-04 1e-04 0.0001 1e-04 0e+00 1e-04
UK 0.0001 1e-04 0.0001 0.0001 1e-04 1e-04 1e-04 1e-04 0.0001 1e-04 1e-04 0e+00
# wing shape differs significantely between countries
MANOVA <- manova(as.matrix(cbind(PCscores[, namePc])) ~ country, PCscores)
summary(MANOVA)
            Df Pillai approx F num Df den Df    Pr(>F)    
country     11 3.7993   35.335    374  25047 < 2.2e-16 ***
Residuals 2300                                            
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Classification as lineages

Comparison with lineages dataset (Nawrocka, Kandemir, Fuchs, & Tofilski, 2018a, 2018b)

idData <- xml2gmLdaData("https://zenodo.org/records/14054009/files/apis-mellifera-lineage.dw.xml")
idM <- gmLdaData2id(idData, dataSample[, nameXy], average = FALSE)
idM$plot

tab <- table(idM$id$group)
tab

   A    C    M    O 
 636  184 1473   19 
prop.table(tab)  # proportion of colonies classified as each of the lineages

          A           C           M           O 
0.275086505 0.079584775 0.637110727 0.008217993 
tab <- table(idM$id$group, dataSample$ISO)
tab
   
     BY  CU  DE  ES  FR  GB  IE  LT  NL  NO  PL  PT
  A   5   2   0  53   3  10  60  36  31  17 407  12
  C   0   0   4   0   6   0   1   5   2   0 166   0
  M   0   7   0 520   1  19 242  28  92  67 317 180
  O   0   0   0   0   1   0   1   7   0   0  10   0
round(prop.table(tab, margin = 2), digits = 3)
   
       BY    CU    DE    ES    FR    GB    IE    LT    NL    NO    PL    PT
  A 1.000 0.222 0.000 0.092 0.273 0.345 0.197 0.474 0.248 0.202 0.452 0.062
  C 0.000 0.000 1.000 0.000 0.545 0.000 0.003 0.066 0.016 0.000 0.184 0.000
  M 0.000 0.778 0.000 0.908 0.091 0.655 0.796 0.368 0.736 0.798 0.352 0.938
  O 0.000 0.000 0.000 0.000 0.091 0.000 0.003 0.092 0.000 0.000 0.011 0.000
pTab <- as.data.frame(idM$P)
sums <- rowSums(pTab)
pTab <- pTab/sums

pTabLong <- pTab
pTabLong$sample <- rownames(pTabLong)
pTabLong <- melt(pTabLong, id.vars = c("sample"))
pTabLong$gr <- substr(pTabLong$sample, 1, 2)  # short 2 letter name

ggplot(pTabLong, aes(sample, value, fill = variable)) + geom_col(width = 1) + scale_fill_manual(name = "classified as lineage",
  values = rainbow(4)) + facet_grid(~gr, switch = "x", scales = "free", space = "free") +
  theme_minimal() + labs(x = NULL, y = NULL) + scale_y_continuous(expand = c(0,
  0)) + guides(fill = guide_legend(nrow = 1, byrow = TRUE)) + theme(panel.spacing.x = unit(0.2,
  "lines"), axis.text.x = element_blank(), legend.position = "top")

countrySmallN <- c("Belarus", "Cuba", "Germany", "France", "UK")
# countries with large N
pTabLong <- pTab
pTabLong <- pTabLong[!dataSample$country %in% countrySmallN, ]
pTabLong$sample <- rownames(pTabLong)
pTabLong <- melt(pTabLong, id.vars = c("sample"))
pTabLong$gr <- substr(pTabLong$sample, 1, 2)  # short 2 letter name
figAdmA <- ggplot(pTabLong, aes(sample, value, fill = variable)) + geom_col(width = 1) +
  scale_fill_manual(name = "classified as lineage", values = rainbow(4)) + facet_grid(~gr,
  switch = "x", scales = "free", space = "free") + theme_minimal() + labs(x = NULL,
  y = NULL) + scale_y_continuous(expand = c(0, 0)) + guides(fill = guide_legend(nrow = 1,
  byrow = TRUE)) + theme(panel.spacing.x = unit(0.2, "lines"), axis.text.x = element_blank(),
  legend.position = "top")

# countries with small N
pTabLong <- pTab
pTabLong <- pTabLong[dataSample$country %in% countrySmallN, ]
pTabLong$sample <- rownames(pTabLong)
pTabLong <- melt(pTabLong, id.vars = c("sample"))
pTabLong$gr <- substr(pTabLong$sample, 1, 2)  # short 2 letter name
figAdmB <- ggplot(pTabLong, aes(sample, value, fill = variable)) + geom_col(width = 1) +
  scale_fill_manual(name = "classified as lineage", values = rainbow(4)) + facet_grid(~gr,
  switch = "x", scales = "free", space = "free") + theme_minimal() + labs(x = NULL,
  y = NULL) + scale_y_continuous(expand = c(0, 0)) + guides(fill = guide_legend(nrow = 1,
  byrow = TRUE)) + theme(panel.spacing.x = unit(0.2, "lines"), axis.text.x = element_blank(),
  legend.position = "top") + theme(axis.text.y = element_blank())

figAdm <- ggarrange(figAdmA, figAdmB, ncol = 2, nrow = 1, widths = c(0.66, 0.33),
  common.legend = TRUE, legend = "top")

dataGeoSample$lineage <- idM$id$group

ggplot(data = dataMap) + geom_sf() + geom_jitter(data = dataGeoSample, aes(x = longitude,
  y = latitude, color = lineage), width = 0.4, height = 0.4, size = 0.7) + coord_sf(xlim = c(xMin,
  xMax), ylim = c(yMin, yMax)) + scale_color_manual(name = "", values = rainbow(4))

tabProp <- prop.table(tab, margin = 2)
tabProp <- t(tabProp)
tabProp <- tabProp[-2, ]
countryLin <- as.data.frame(tabProp)
colnames(countryLin) <- c("ISO", "lineage", "Freq")
countryLin <- merge(countryLin, dataGeoCountry, by = "ISO")
countryLin$longitude[countryLin$ISO == "BY"] <- 30  # move Belarus plot
countryLin$ISO <- gsub("GB", "UK", countryLin$ISO)  # move Belarus plot

# Plot map with circular pie charts
dataMap_proj <- st_transform(dataMap, 3035)  # Europe projection

europe_bbox <- st_bbox(c(xmin = xMin, xmax = xMax, ymin = yMin, ymax = 70), crs = st_crs(4326))  # Crop to Europe
europe_bbox <- st_transform(st_as_sfc(europe_bbox), 3035)
dataMap_europe <- st_crop(dataMap_proj, europe_bbox)

# Convert and project pie centers
countryLin_sf <- st_as_sf(countryLin, coords = c("longitude", "latitude"), crs = 4326)
countryLin_proj <- st_transform(countryLin_sf, 3035)
coords <- st_coordinates(countryLin_proj)
countryLin_proj <- cbind(countryLin, coords)

# r: radius in meters (adjust size)
figMapClass <- ggplot() + geom_sf(data = dataMap_europe) + geom_arc_bar(data = countryLin_proj,
  aes(x0 = X, y0 = Y, r0 = 0, r = 150000, amount = Freq, fill = lineage), stat = "pie") +
  geom_text(data = countryLin_proj, aes(x = X, y = Y, label = ISO), vjust = 2.5) +
  scale_fill_manual(values = rainbow(4)) + coord_sf(expand = FALSE) + theme(legend.position = "none",
  axis.title.x = element_blank(), axis.title.y = element_blank())
figMapClass

# Figure 4
ggarrange(figAdm, figMapClass, labels = c("a", "b"), font.label = list(size = 12,
  face = "bold"), ncol = 1, nrow = 2, heights = c(0.5, 1.5))

Changes in time

dataId <- cbind(dataSample[, c("latitude", "longitude", "date", "country")], idM$MD2,
  idM$LD)
dataId$sample <- row.names(dataSample)
dataPl <- dataId[dataId$country == "Poland", ]
tmp <- str_split_fixed(dataPl$sample, "-", 2)
tmp <- as.numeric(tmp[, 2])
dataPl$sampleNr <- tmp

dataPl$years <- ifelse(dataPl$date < 2016, "before 2016", "after 2016")
dataPl$years <- factor(dataPl$years, levels = c("before 2016", "after 2016"))
dataPl$population <- ifelse((dataPl$sampleNr > 253 & dataPl$sampleNr < 680), "protected",
  "unprotected")
dataPl$population <- as.factor(dataPl$population)
table(dataPl$population, dataPl$years)
             
              before 2016 after 2016
  protected           315         91
  unprotected         241        253
fit <- lm(M ~ years * population, data = dataPl)
anova(fit)
Analysis of Variance Table

Response: M
                  Df  Sum Sq Mean Sq F value    Pr(>F)    
years              1 1027777 1027777 652.855 < 2.2e-16 ***
population         1   65352   65352  41.512  1.91e-10 ***
years:population   1  311219  311219 197.690 < 2.2e-16 ***
Residuals        896 1410556    1574                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emm_date_by_pop <- emmeans(fit, ~years | population)
pairs(emm_date_by_pop, adjust = "tukey")
population = protected:
 contrast                 estimate   SE  df t.ratio p.value
 before 2016 - after 2016    -11.2 4.72 896  -2.367  0.0181

population = unprotected:
 contrast                 estimate   SE  df t.ratio p.value
 before 2016 - after 2016    -94.4 3.57 896 -26.438  <.0001
emm_pop_by_date <- emmeans(fit, ~population | years)
pairs(emm_pop_by_date, adjust = "tukey")
years = before 2016:
 contrast                estimate   SE  df t.ratio p.value
 protected - unprotected     9.46 3.40 896   2.786  0.0054

years = after 2016:
 contrast                estimate   SE  df t.ratio p.value
 protected - unprotected   -73.78 4.85 896 -15.213  <.0001
# Figure 5
ggplot(dataPl, aes(x = population, y = M)) + geom_boxplot(aes(fill = years)) + scale_fill_brewer(palette = "BuPu") +
  labs(y = "distance to lineage M")

References

Machlowska, J., Gerula, D., Oleksa, A., Kok, H., McCormack, G., Valentine, A., … Tofilski, A. (2026). Fore wings of honey bees (apis mellifera) from northwestern europe. https://doi.org/10.5281/zenodo.17612437
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofilski, A. (2018a). Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49(2), 172184. https://doi.org/10.1007/s13592-017-0538-y
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofilski, A. (2018b). Dataset: Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49, 172184. https://doi.org/10.5281/zenodo.7498540
Oleksa, A., Căuia, E., Siceanu, A., Puškadija, Z., Kovačić, M., Pinto, M. A., … Tofilski, A. (2022). Collection of wing images for conservation of honey bees (Apis mellifera) biodiversity in Europe. Zenodo. https://doi.org/10.5281/zenodo.7244070
Tofilski, A. (2025). Measurements of honey bee (apis mellifera) wings originally provided by masaquiza et al. 2023. https://doi.org/10.5281/zenodo.17621032

Information about session

sessionInfo()
R version 4.5.0 (2025-04-11 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26200)

Matrix products: default
  LAPACK version 3.12.1

locale:
[1] LC_COLLATE=en_US.UTF-8  LC_CTYPE=en_US.UTF-8    LC_MONETARY=en_US.UTF-8
[4] LC_NUMERIC=C            LC_TIME=en_US.UTF-8    

time zone: Europe/Warsaw
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggforce_0.5.0       ggpubr_0.6.2        sf_1.0-21          
 [4] terra_1.8-80        rnaturalearth_1.1.0 ggplot2_4.0.0      
 [7] emmeans_2.0.0       stringr_1.6.0       reshape2_1.4.4     
[10] IdentiFlyR_0.1.1    Morpho_2.13         geomorph_4.0.10    
[13] Matrix_1.7-3        rgl_1.3.24          RRPP_2.1.2         

loaded via a namespace (and not attached):
 [1] DBI_1.2.3               gridExtra_2.3           formatR_1.14           
 [4] sandwich_3.1-1          rlang_1.1.6             magrittr_2.0.4         
 [7] multcomp_1.4-29         e1071_1.7-16            compiler_4.5.0         
[10] vctrs_0.6.5             pkgconfig_2.0.3         fastmap_1.2.0          
[13] backports_1.5.0         labeling_0.4.3          rmarkdown_2.30         
[16] purrr_1.2.0             xfun_0.54.1             Rvcg_0.25              
[19] jsonlite_2.0.0          tweenr_2.0.3            jpeg_0.1-11            
[22] broom_1.0.10            parallel_4.5.0          R6_2.6.1               
[25] stringi_1.8.7           RColorBrewer_1.1-3      bezier_1.1.2           
[28] car_3.1-3               estimability_1.5.1      Rcpp_1.1.0             
[31] iterators_1.0.14        knitr_1.50              zoo_1.8-14             
[34] base64enc_0.1-3         splines_4.5.0           tidyselect_1.2.1       
[37] rstudioapi_0.17.1       abind_1.4-8             yaml_2.3.10            
[40] doParallel_1.0.17       codetools_0.2-20        minpack.lm_1.2-4       
[43] curl_7.0.0              lattice_0.22-6          tibble_3.3.0           
[46] plyr_1.8.9              withr_3.0.2             S7_0.2.0               
[49] coda_0.19-4.1           evaluate_1.0.5          survival_3.8-3         
[52] rnaturalearthdata_1.0.0 units_1.0-0             proxy_0.4-27           
[55] polyclip_1.10-7         fitdistrplus_1.2-4      xml2_1.4.1             
[58] mclust_6.1.2            pillar_1.11.1           carData_3.0-5          
[61] KernSmooth_2.23-26      foreach_1.5.2           generics_0.1.4         
[64] colorRamps_2.3.4        scales_1.4.0            xtable_1.8-4           
[67] class_7.3-23            glue_1.8.0              scatterplot3d_0.3-44   
[70] tools_4.5.0             ggsignif_0.6.4          mvtnorm_1.3-3          
[73] cowplot_1.2.0           grid_4.5.0              tidyr_1.3.1            
[76] ape_5.8-1               nlme_3.1-168            Formula_1.2-5          
[79] cli_3.6.5               dplyr_1.1.4             gtable_0.3.6           
[82] shapes_1.2.8            rstatix_0.7.3           digest_0.6.37          
[85] classInt_0.4-11         ggrepel_0.9.6           TH.data_1.1-4          
[88] htmlwidgets_1.6.4       farver_2.1.2            htmltools_0.5.8.1      
[91] lifecycle_1.0.4         MASS_7.3-65