Jupyter Notebook: height statistics of the best competition climbers
This is a static view of the Jupyter notebook I used for my calculations. You can download all the source files here; I used Python 3.11. Some lines below are cut off because of the narrow width of the blog; if you prefer, you can open the HTML file from the source files directly in your browser.
Initial imports and a function downloading height¶
%matplotlib inline
import bs4
import requests
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path
from scipy import stats
def get_height(personal_url):
    """Download an athlete's profile page and return the height listed there.

    The page is searched for a ``<div class="personal-info">``; inside it, the
    paragraph that follows the one containing 'HEIGHT' is parsed as an integer.

    Parameters
    ----------
    personal_url : str
        URL of the athlete's profile page.

    Returns
    -------
    int or float
        The height as an ``int``, or ``np.nan`` when the page has no
        personal-info block, no 'HEIGHT' label, or a value that is not a
        whole number.
    """
    # a timeout keeps one stalled request from hanging the whole scraping run
    html_content = requests.get(personal_url, timeout=30).text
    soup = bs4.BeautifulSoup(html_content, "lxml")
    info = soup.find('div', class_="personal-info")
    if info is None:
        return np.nan
    label_found = False
    for p in info.find_all('p'):
        # get_text() always yields a str; p.string is None for empty or nested
        # paragraphs and would make the substring test below raise TypeError
        text = p.get_text(strip=True)
        if 'HEIGHT' in text:
            label_found = True
        elif label_found:
            try:
                return int(text)
            except ValueError:
                # a value is present but is not a whole number: treat as missing
                return np.nan
    return np.nan
# Smoke test of get_height against four known athlete pages.
# these links worked as of November 2023, they might have changed since then
anraku_url = 'https://www.ifsc-climbing.org/index.php?option=com_ifsc&task=athlete.display&id=13040'
song_url = 'https://www.ifsc-climbing.org/index.php?option=com_ifsc&task=athlete.display&id=14018'
hanke_url = 'https://www.ifsc-climbing.org/index.php?option=com_ifsc&task=athlete.display&id=2014'
martin_url = 'https://www.ifsc-climbing.org/index.php?option=com_ifsc&task=athlete.display&id=531'
assert get_height(anraku_url) == 168
assert get_height(song_url) == 168
assert get_height(hanke_url) == 167
# np.isnan instead of `is np.nan`: an identity test only passes for the very
# same NaN object, while any NaN value means "no height available"
assert np.isnan(get_height(martin_url))
print('No errors downloading heights')
# rankings were downloaded as html tables from here:
# https://www.ifsc-climbing.org/index.php/world-competition/ranking
No errors downloading heights
Preparing a dataframe for the world ranking¶
# Build (or reload) the world-ranking table: one row per ranked climber with
# sex, lower-cased full name, height, country and rank.
data_path = Path('ifsc_data/worldranking.csv')
if data_path.is_file():
    # if the csv already exists we can skip the whole process
    df = pd.read_csv(data_path)
else:
    df = pd.DataFrame(columns=['sex', 'full_name', 'height', 'country', 'rank'])
    for sex in ['M', 'F']:
        gender = "men" if sex == "M" else "women"
        # binary mode lets BeautifulSoup detect the encoding itself instead of
        # relying on the platform default text encoding
        with open(f'ifsc_data/{gender}_worldranking.html', 'rb') as file:
            table = bs4.BeautifulSoup(file, 'lxml')
        rows = table.find('tbody').find_all('tr')
        for row in rows:
            df_row = {'sex': sex}
            cols = row.find_all('td')
            df_row['rank'] = int(cols[0].find('p').string)
            df_row['country'] = cols[3].find('a').string
            # columns 1 and 2 hold the two name parts, each wrapped in a link
            col_one = cols[1].find('a')
            col_two = cols[2].find('a')
            df_row['full_name'] = ' '.join([col_one.string.lower(), col_two.string.lower()])
            existing_rows = df.loc[df['full_name'] == df_row['full_name']]
            if len(existing_rows) > 0:
                # climber already seen: reuse the height instead of downloading again
                df_row['height'] = existing_rows.iloc[0]['height']
                print(' Using saved height for', df_row['full_name'], ':', df_row['height'])
            else:
                if col_one.get('href') != col_two.get('href'):
                    # the original referenced an undefined name `full_name` here
                    print(f"WARNING: Two different urls for {df_row['full_name']}")
                url = col_one.get('href')
                print(' Downloading height for', df_row['full_name'])
                df_row['height'] = get_height(url)
            df.loc[len(df)] = df_row
    df.to_csv(data_path, index=False)
# Read the ranking back from the CSV and sanity-check it for duplicated rows.
df_rank = pd.read_csv('ifsc_data/worldranking.csv')
n_duplicates = np.count_nonzero(df_rank.duplicated())  # just to be sure
print('No. of duplicates:', n_duplicates)
df_rank.info()
No. of duplicates: 0 <class 'pandas.core.frame.DataFrame'> RangeIndex: 218 entries, 0 to 217 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sex 218 non-null object 1 full_name 218 non-null object 2 height 148 non-null float64 3 country 218 non-null object 4 rank 218 non-null int64 dtypes: float64(1), int64(1), object(3) memory usage: 8.6+ KB
Analysis of the world ranking data for men¶
# Men's part of the world ranking. The mask is built from df_rank itself; the
# original used the unrelated global `df`, which only matched by coincidence
# of identical row indices.
df_m = df_rank.loc[df_rank['sex'] == 'M']
df_m.info()
<class 'pandas.core.frame.DataFrame'> Index: 106 entries, 0 to 105 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sex 106 non-null object 1 full_name 106 non-null object 2 height 72 non-null float64 3 country 106 non-null object 4 rank 106 non-null int64 dtypes: float64(1), int64(1), object(3) memory usage: 5.0+ KB
# Summary statistics of the numeric columns (height, rank) for the men's ranking
df_m.describe()
height | rank | |
---|---|---|
count | 72.000000 | 106.000000 |
mean | 174.847222 | 53.481132 |
std | 6.368186 | 30.712563 |
min | 162.000000 | 1.000000 |
25% | 170.000000 | 27.250000 |
50% | 175.000000 | 53.500000 |
75% | 179.000000 | 79.750000 |
max | 198.000000 | 106.000000 |
# Ranked men taller than 184 cm
df_m.loc[df_m['height'] > 184]
sex | full_name | height | country | rank | |
---|---|---|---|---|---|
9 | M | adam ondra | 186.0 | CZE | 10 |
11 | M | meichi narasaki | 188.0 | JPN | 12 |
15 | M | paul jenft | 198.0 | FRA | 16 |
# Ranked men shorter than 165 cm
df_m.loc[df_m['height'] < 165]
sex | full_name | height | country | rank | |
---|---|---|---|---|---|
13 | M | sascha lehmann | 164.0 | SUI | 14 |
14 | M | sean bailey | 163.0 | USA | 15 |
19 | M | shion omata | 162.0 | JPN | 20 |
52 | M | dillon countryman | 164.0 | USA | 53 |
# Histogram of the men's heights with the mean (solid line) and
# mean +- one standard deviation (dashed lines) marked.
heights_m = df_m['height'].dropna()
mean = heights_m.mean()
std = heights_m.std()
# One bin per centimetre. The edges must run up to max+1: the last histogram
# bin is closed on both sides, so stopping the edges at max would merge the
# counts for max-1 and max into a single bar.
bins = np.arange(np.floor(heights_m.min()), heights_m.max() + 2, 1)
plt.hist(heights_m, bins=bins, color='deepskyblue', zorder=10, alpha=0.9)
plt.axvline(mean, color='black', lw=0.9)
plt.axvline(mean - std, linestyle='--', color='black', lw=0.9)
plt.axvline(mean + std, linestyle='--', color='black', lw=0.9)
ax = plt.gca()
ax.set_xticks(bins[::3])
# the backslash is doubled because '\p' is not a valid string escape
plt.text(0.6, 0.93, f'Avg. height is {round(mean, 1)} $\\pm$ {round(std, 1)}', transform=ax.transAxes)
plt.title('Height distribution of male climbers\nregistered in the IFSC World Ranking 2023')
plt.xlabel('height [cm]')
plt.ylabel('number of climbers')
plt.show()
# Compare the average rank of men with and without a recorded height.
has_height_m = df_m['height'].notnull()
not_null_m = df_m.loc[has_height_m]
null_m = df_m.loc[~has_height_m]
avg_rank = round(df_m['rank'].mean())
avg_rank_nan = round(null_m['rank'].mean())
avg_rank_not_nan = round(not_null_m['rank'].mean())
n_without_m = len(null_m)
n_with_m = len(not_null_m)
print(f'Climbers in ranking: {n_without_m+n_with_m}; with height {n_with_m}, without {n_without_m}')
print(
    f'avg. rank: {avg_rank}; avg. rank of climbers w/out height: {avg_rank_nan},'
    f' and with height data: {avg_rank_not_nan}'
)
Climbers in ranking: 106; with height 72, without 34 avg. rank: 53; avg. rank of climbers w/out height: 82, and with height data: 40
Statistical tests for the distribution and comparison with the general population¶
I perform three tests here, which are repeated for the other histograms as well. The first one is D'Agostino and Pearson's test for normality, to see whether the distribution could be Gaussian. The second one is the Kolmogorov-Smirnov test, comparing the empirical distribution with a normal distribution whose mean and standard deviation are those of the general population. This test shows whether the best climbers could have the same height distribution as society as a whole. The last one is Student's t-test, comparing just the average value for climbers with that of the general population (this test assumes that the distribution is normal, hence the first test).
# Paul Jenft is 10cm taller than the next tallest climber, so he is left out
# as an outlier; rows without a height are dropped as well.
has_height_m = df_m['height'].notnull()
below_outlier_m = df_m['height'] < 195
df_m_clean = df_m.loc[has_height_m & below_outlier_m]
# D'Agostino and Pearson's normality test on the cleaned heights
stats.normaltest(df_m_clean['height'])
NormaltestResult(statistic=1.1058087166129469, pvalue=0.5752765724237572)
# Reference values for the general male population
world_m_height_avg = 178.4  # source: https://doi.org/10.7554/eLife.20320.001
world_m_height_std = 7.59  # it's an average for men born between 1980 and 1994
# Kolmogorov-Smirnov test of the climbers' heights against N(avg, std)
world_m_norm_params = (world_m_height_avg, world_m_height_std)
stats.ks_1samp(df_m_clean['height'], stats.norm.cdf, args=world_m_norm_params)
KstestResult(statistic=0.28539728488157, pvalue=1.3022112549577226e-05, statistic_location=179.0, statistic_sign=1)
# One-sample t-test of the climbers' mean height against the general-population mean
stats.ttest_1samp(df_m_clean['height'], world_m_height_avg)
TtestResult(statistic=-5.658138174840393, pvalue=3.107588094982158e-07, df=70)
Statistics for the top 50 climbers¶
# Top 50 men by rank. The mask is taken from df_m itself; the original used the
# global `df`, a different frame that merely shared the same row labels.
df_m_top_50 = df_m.loc[df_m['rank'] <= 50]
df_m_top_50.info()
<class 'pandas.core.frame.DataFrame'> Index: 50 entries, 0 to 49 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sex 50 non-null object 1 full_name 50 non-null object 2 height 49 non-null float64 3 country 50 non-null object 4 rank 50 non-null int64 dtypes: float64(1), int64(1), object(3) memory usage: 2.3+ KB
# Summary statistics of height and rank for the top 50 men
df_m_top_50.describe() # the average of top 50 is 174cm without Paul Jenft
height | rank | |
---|---|---|
count | 49.000000 | 50.00000 |
mean | 174.653061 | 25.50000 |
std | 6.796295 | 14.57738 |
min | 162.000000 | 1.00000 |
25% | 170.000000 | 13.25000 |
50% | 175.000000 | 25.50000 |
75% | 178.000000 | 37.75000 |
max | 198.000000 | 50.00000 |
# Histogram of the top-50 men's heights with the mean (solid line) and
# mean +- one standard deviation (dashed lines) marked.
heights_m_top_50 = df_m_top_50['height'].dropna()
mean = heights_m_top_50.mean()
std = heights_m_top_50.std()
# One bin per centimetre. The edges must run up to max+1: the last histogram
# bin is closed on both sides, so stopping the edges at max would merge the
# counts for max-1 and max into a single bar.
bins = np.arange(np.floor(heights_m_top_50.min()), heights_m_top_50.max() + 2, 1)
plt.hist(heights_m_top_50, bins=bins, color='deepskyblue', zorder=10, alpha=0.9)
plt.axvline(mean, color='black', lw=0.9)
plt.axvline(mean - std, linestyle='--', color='black', lw=0.9)
plt.axvline(mean + std, linestyle='--', color='black', lw=0.9)
ax = plt.gca()
ax.set_xticks(bins[::3])
# the backslash is doubled because '\p' is not a valid string escape
plt.text(0.6, 0.93, f'Avg. height is {round(mean, 1)} $\\pm$ {round(std, 1)}', transform=ax.transAxes)
plt.title('Height distribution of the top 50 male climbers')
plt.xlabel('height [cm]')
plt.ylabel('number of climbers')
plt.show()
# Same cleaning as for the full ranking: keep known heights below 195 cm
has_height_m_50 = df_m_top_50['height'].notnull()
below_outlier_m_50 = df_m_top_50['height'] < 195
df_m_clean_50 = df_m_top_50.loc[has_height_m_50 & below_outlier_m_50]
# normality test on the cleaned top-50 heights
stats.normaltest(df_m_clean_50['height'])
NormaltestResult(statistic=0.22144894346920682, pvalue=0.8951853638232864)
# Kolmogorov-Smirnov test of the top-50 heights against the general-population normal distribution
stats.ks_1samp(df_m_clean_50['height'], stats.norm.cdf, args=(world_m_height_avg, world_m_height_std))
KstestResult(statistic=0.3226625430975324, pvalue=6.0124285907116906e-05, statistic_location=179.0, statistic_sign=1)
# One-sample t-test of the top-50 mean height against the general-population mean
stats.ttest_1samp(df_m_clean_50['height'], world_m_height_avg)
TtestResult(statistic=-4.93413218077492, pvalue=1.050704076544895e-05, df=47)
Countries with the most ranked climbers and their average height¶
# Per country: number of ranked men and their mean height, most climbers first.
df_m_country = (
    df_m.groupby(['country'], as_index=False)
    .agg({'full_name': 'count', 'height': 'mean'})
    .sort_values('full_name', ascending=False)
)
# readable headers for display only; df_m_country keeps the raw column names
df_m_country.rename(columns={'full_name': 'no. of climbers', 'height': 'avg. height'})
country | no. of climbers | avg. height | |
---|---|---|---|
21 | ITA | 7 | 174.800000 |
12 | GBR | 7 | 175.166667 |
35 | USA | 6 | 171.000000 |
2 | AUT | 6 | 178.000000 |
22 | JPN | 6 | 172.600000 |
30 | SLO | 6 | 178.333333 |
11 | FRA | 5 | 179.200000 |
23 | KOR | 5 | 173.000000 |
6 | CAN | 5 | 172.750000 |
20 | ISR | 5 | 168.333333 |
1 | AUS | 4 | 173.500000 |
13 | GER | 3 | 176.000000 |
4 | BRA | 3 | NaN |
16 | INA | 3 | 171.000000 |
3 | BEL | 3 | 174.000000 |
9 | CZE | 3 | 182.000000 |
26 | MEX | 2 | NaN |
31 | SUI | 2 | 169.500000 |
33 | SWE | 2 | 177.000000 |
18 | IRI | 2 | NaN |
14 | HKG | 2 | 172.000000 |
10 | ESP | 2 | 181.000000 |
8 | CHN | 2 | 173.000000 |
7 | CHI | 2 | 180.000000 |
5 | BUL | 2 | 172.000000 |
19 | IRL | 1 | 174.000000 |
17 | IND | 1 | NaN |
15 | HUN | 1 | NaN |
24 | LAT | 1 | 181.000000 |
25 | LUX | 1 | NaN |
27 | PER | 1 | NaN |
28 | RSA | 1 | 174.000000 |
29 | SGP | 1 | NaN |
32 | SVK | 1 | 175.000000 |
34 | THA | 1 | NaN |
0 | ARG | 1 | NaN |
Analysis of the world ranking data for women¶
# Women's part of the world ranking. The mask is built from df_rank itself; the
# original used the unrelated global `df`, which only matched by coincidence
# of identical row indices.
df_f = df_rank.loc[df_rank['sex'] == 'F']
df_f.info()
<class 'pandas.core.frame.DataFrame'> Index: 112 entries, 106 to 217 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sex 112 non-null object 1 full_name 112 non-null object 2 height 76 non-null float64 3 country 112 non-null object 4 rank 112 non-null int64 dtypes: float64(1), int64(1), object(3) memory usage: 5.2+ KB
# Summary statistics of the numeric columns (height, rank) for the women's ranking
df_f.describe()
height | rank | |
---|---|---|
count | 76.000000 | 112.000000 |
mean | 163.289474 | 56.410714 |
std | 5.285996 | 32.394681 |
min | 150.000000 | 1.000000 |
25% | 160.000000 | 28.750000 |
50% | 163.000000 | 56.500000 |
75% | 166.000000 | 84.250000 |
max | 175.000000 | 112.000000 |
# Ranked women taller than 173 cm
df_f.loc[df_f['height'] > 173]
sex | full_name | height | country | rank | |
---|---|---|---|---|---|
125 | F | stasa gejo | 175.0 | SRB | 20 |
143 | F | flavy cohaut | 174.0 | FRA | 38 |
146 | F | julija kruder | 175.0 | SLO | 41 |
# Ranked women shorter than 153 cm
df_f.loc[df_f['height'] < 153]
sex | full_name | height | country | rank | |
---|---|---|---|---|---|
114 | F | jain kim | 152.0 | KOR | 9 |
128 | F | laura rogora | 152.0 | ITA | 23 |
181 | F | chaeyeong kim | 150.0 | KOR | 75 |
# Histogram of the women's heights with the mean (solid line) and
# mean +- one standard deviation (dashed lines) marked.
heights_f = df_f['height'].dropna()
mean = heights_f.mean()
std = heights_f.std()
# One bin per centimetre. The edges must run up to max+1: the last histogram
# bin is closed on both sides, so stopping the edges at max would merge the
# counts for max-1 and max into a single bar.
bins = np.arange(np.floor(heights_f.min()), heights_f.max() + 2, 1)
plt.hist(heights_f, bins=bins, color='tomato', zorder=10, alpha=0.9)
plt.axvline(mean, color='black', lw=0.9)
plt.axvline(mean - std, linestyle='--', color='black', lw=0.9)
plt.axvline(mean + std, linestyle='--', color='black', lw=0.9)
ax = plt.gca()
ax.set_xticks(bins[::3])
# the backslash is doubled because '\p' is not a valid string escape
plt.text(0.05, 0.87, f'Avg. height\nis {round(mean, 1)} $\\pm$ {round(std, 1)}', transform=ax.transAxes)
plt.title('Height distribution of female climbers\nregistered in the IFSC World Ranking 2023')
plt.xlabel('height [cm]')
plt.ylabel('number of climbers')
plt.show()
# Compare the average rank of women with and without a recorded height.
has_height_f = df_f['height'].notnull()
not_null_f = df_f.loc[has_height_f]
null_f = df_f.loc[~has_height_f]
avg_rank = round(df_f['rank'].mean())
avg_rank_nan = round(null_f['rank'].mean())
avg_rank_not_nan = round(not_null_f['rank'].mean())
n_without_f = len(null_f)
n_with_f = len(not_null_f)
print(f'Climbers in ranking: {n_without_f+n_with_f}; with height {n_with_f}, without {n_without_f}')
print(
    f'avg. rank: {avg_rank}; avg. rank of climbers w/out height: {avg_rank_nan},'
    f' and with height data: {avg_rank_not_nan}'
)
Climbers in ranking: 112; with height 76, without 36 avg. rank: 56; avg. rank of climbers w/out height: 87, and with height data: 42
Statistical tests for the distribution and comparison with the general population¶
# Keep only the women with a recorded height, then test the heights for normality
df_f_clean = df_f.dropna(subset=['height'])
stats.normaltest(df_f_clean['height'])
NormaltestResult(statistic=0.34554396819934946, pvalue=0.8413294294627298)
# Reference values for the general female population
world_f_height_avg = 164.7  # source: https://doi.org/10.7554/eLife.20320.001
world_f_height_std = 7.07  # it's an average for women born between 1980 and 1994
# Kolmogorov-Smirnov test of the climbers' heights against N(avg, std)
world_f_norm_params = (world_f_height_avg, world_f_height_std)
stats.ks_1samp(df_f_clean['height'], stats.norm.cdf, args=world_f_norm_params)
KstestResult(statistic=0.21991894003448065, pvalue=0.0010427677664592615, statistic_location=165.0, statistic_sign=1)
# One-sample t-test of the climbers' mean height against the general-population mean
stats.ttest_1samp(df_f_clean['height'], world_f_height_avg)
TtestResult(statistic=-2.3262755376972337, pvalue=0.02270781036453741, df=75)
Statistics for the top 50 climbers¶
# Top 50 women by rank. The mask is taken from df_f itself; the original used
# the global `df`, a different frame that merely shared the same row labels.
df_f_top_50 = df_f.loc[df_f['rank'] <= 50]
df_f_top_50.info()
<class 'pandas.core.frame.DataFrame'> Index: 50 entries, 106 to 155 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sex 50 non-null object 1 full_name 50 non-null object 2 height 50 non-null float64 3 country 50 non-null object 4 rank 50 non-null int64 dtypes: float64(1), int64(1), object(3) memory usage: 2.3+ KB
# Summary statistics of height and rank for the top 50 women
df_f_top_50.describe()
height | rank | |
---|---|---|
count | 50.000000 | 50.000000 |
mean | 163.580000 | 25.480000 |
std | 5.496158 | 14.578766 |
min | 152.000000 | 1.000000 |
25% | 160.000000 | 13.250000 |
50% | 163.000000 | 25.000000 |
75% | 166.750000 | 37.750000 |
max | 175.000000 | 50.000000 |
# Histogram of the top-50 women's heights with the mean (solid line) and
# mean +- one standard deviation (dashed lines) marked.
heights_f_top_50 = df_f_top_50['height'].dropna()
mean = heights_f_top_50.mean()
std = heights_f_top_50.std()
# One bin per centimetre. The edges must run up to max+1: the last histogram
# bin is closed on both sides, so stopping the edges at max would merge the
# counts for max-1 and max into a single bar.
bins = np.arange(np.floor(heights_f_top_50.min()), heights_f_top_50.max() + 2, 1)
plt.hist(heights_f_top_50, bins=bins, color='tomato', zorder=10, alpha=0.9)
plt.axvline(mean, color='black', lw=0.9)
plt.axvline(mean - std, linestyle='--', color='black', lw=0.9)
plt.axvline(mean + std, linestyle='--', color='black', lw=0.9)
ax = plt.gca()
ax.set_xticks(bins[::3])
# the backslash is doubled because '\p' is not a valid string escape
plt.text(0.05, 0.87, f'Avg. height\nis {round(mean, 1)} $\\pm$ {round(std, 1)}', transform=ax.transAxes)
plt.title('Height distribution of the top 50 female climbers')
plt.xlabel('height [cm]')
plt.ylabel('number of climbers')
plt.show()
# Keep only the top-50 women with a recorded height, then test for normality
df_f_clean_50 = df_f_top_50.dropna(subset=['height'])
stats.normaltest(df_f_clean_50['height'])
NormaltestResult(statistic=0.3251394770115752, pvalue=0.8499568134403317)
# Kolmogorov-Smirnov test of the top-50 heights against the general-population normal distribution
stats.ks_1samp(df_f_clean_50['height'], stats.norm.cdf, args=(world_f_height_avg, world_f_height_std))
KstestResult(statistic=0.20307683477132277, pvalue=0.02766478274130235, statistic_location=165.0, statistic_sign=1)
# One-sample t-test of the top-50 mean height against the general-population mean
stats.ttest_1samp(df_f_clean_50['height'], world_f_height_avg)
TtestResult(statistic=-1.4409330376615785, pvalue=0.15596368735799032, df=49)
Countries with the most ranked climbers and their average height¶
# Per country: number of ranked women and their mean height, most climbers first.
df_f_country = (
    df_f.groupby(['country'], as_index=False)
    .agg({'full_name': 'count', 'height': 'mean'})
    .sort_values('full_name', ascending=False)
)
# readable headers for display only; df_f_country keeps the raw column names
df_f_country.rename(columns={'full_name': 'no. of climbers', 'height': 'avg. height'})
country | no. of climbers | avg. height | |
---|---|---|---|
38 | USA | 9 | 162.833333 |
32 | SLO | 8 | 167.125000 |
13 | FRA | 7 | 163.714286 |
22 | JPN | 7 | 162.571429 |
15 | GER | 5 | 164.750000 |
24 | KOR | 5 | 156.250000 |
10 | CZE | 4 | 162.000000 |
14 | GBR | 4 | 161.333333 |
20 | ISR | 4 | 166.500000 |
7 | CHI | 4 | 160.666667 |
6 | CAN | 4 | 161.000000 |
4 | BRA | 4 | 164.000000 |
21 | ITA | 4 | 159.000000 |
25 | MEX | 3 | NaN |
1 | AUS | 3 | 173.000000 |
34 | SUI | 3 | 163.333333 |
37 | UKR | 3 | 159.000000 |
0 | ARG | 3 | 157.000000 |
2 | AUT | 3 | 165.000000 |
8 | CHN | 3 | 160.000000 |
18 | IRI | 2 | 161.000000 |
11 | ESP | 2 | NaN |
30 | PUR | 1 | NaN |
3 | BEL | 1 | 166.000000 |
36 | TPE | 1 | 170.000000 |
35 | SVK | 1 | 169.000000 |
5 | BUL | 1 | 162.000000 |
33 | SRB | 1 | 175.000000 |
31 | RSA | 1 | NaN |
29 | PER | 1 | 168.000000 |
19 | ISL | 1 | 171.000000 |
28 | NOR | 1 | NaN |
27 | NED | 1 | 168.000000 |
26 | MKD | 1 | NaN |
9 | CRO | 1 | NaN |
23 | KAZ | 1 | NaN |
12 | FIN | 1 | NaN |
16 | INA | 1 | 158.000000 |
17 | IND | 1 | NaN |
39 | VEN | 1 | NaN |
Preparing a dataframe for the world cups¶
# Build (or reload) the world-cup table: one row per climber, year, category
# (lead/boulder) and sex, with height, country and rank.
data_path = Path('ifsc_data/world_cups.csv')
if data_path.is_file():
    df = pd.read_csv(data_path)
else:
    df = pd.DataFrame(columns=['year', 'category', 'sex', 'full_name', 'height', 'country', 'rank'])
    years = list(range(2013, 2024))[::-1]
    for year in years:
        if len(df.loc[df['year'] == year]) > 0 or year == 2020:
            # I assume that all the data for this year has been loaded before
            # (2020 is always skipped)
            continue
        for sex in ['M', 'F']:
            for cat in ['lead', 'boulder']:
                gender = "men" if sex == "M" else "women"
                print(f"Loading year {year} for {gender}'s {cat}")
                # binary mode lets BeautifulSoup detect the encoding itself
                # instead of relying on the platform default text encoding
                with open(f'ifsc_data/{gender}_{cat}_{year}.html', 'rb') as file:
                    table = bs4.BeautifulSoup(file, 'lxml')
                rows = table.find('tbody').find_all('tr')
                for row in rows:
                    df_row = {'year': year, 'category': cat, 'sex': sex}
                    cols = row.find_all('td')
                    df_row['rank'] = int(cols[0].find('p').string)
                    df_row['country'] = cols[3].find('a').string
                    # columns 1 and 2 hold the two name parts, each wrapped in a link
                    col_one = cols[1].find('a')
                    col_two = cols[2].find('a')
                    df_row['full_name'] = ' '.join([col_one.string.lower(), col_two.string.lower()])
                    existing_rows = df.loc[df['full_name'] == df_row['full_name']]
                    if len(existing_rows) > 0:
                        # climber already seen: reuse the height instead of downloading again
                        df_row['height'] = existing_rows.iloc[0]['height']
                        print(' Using saved height for', df_row['full_name'], ':', df_row['height'])
                    else:
                        if col_one.get('href') != col_two.get('href'):
                            # the original referenced an undefined name `full_name` here
                            print(f"WARNING: Two different urls for {df_row['full_name']}")
                        url = col_one.get('href')
                        df_row['height'] = get_height(url)
                    df.loc[len(df)] = df_row
    df.to_csv(data_path, index=False)
# Read the world-cup data back from the CSV and sanity-check it for duplicated rows.
df_cup = pd.read_csv('ifsc_data/world_cups.csv')
n_duplicates = np.count_nonzero(df_cup.duplicated())  # just to be sure
print('No. of duplicates:', n_duplicates)
df_cup.info()
No. of duplicates: 0 <class 'pandas.core.frame.DataFrame'> RangeIndex: 4358 entries, 0 to 4357 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 4358 non-null int64 1 category 4358 non-null object 2 sex 4358 non-null object 3 full_name 4358 non-null object 4 height 1849 non-null float64 5 country 4358 non-null object 6 rank 4358 non-null int64 dtypes: float64(1), int64(2), object(4) memory usage: 238.5+ KB
Basic statistics for both categories for men¶
# All men's world-cup entries (lead and boulder together)
men_mask = df_cup['sex'] == 'M'
df_cup_m = df_cup.loc[men_mask]
df_cup_m.info()
<class 'pandas.core.frame.DataFrame'> Index: 2186 entries, 0 to 4143 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 2186 non-null int64 1 category 2186 non-null object 2 sex 2186 non-null object 3 full_name 2186 non-null object 4 height 921 non-null float64 5 country 2186 non-null object 6 rank 2186 non-null int64 dtypes: float64(1), int64(2), object(4) memory usage: 136.6+ KB
# Aggregate the men's world-cup entries per climber: number of ranking entries,
# how many of them are in lead and in boulder, mean height and mean rank.
# 'category' has to be counted twice and a dict-style agg takes one entry per
# column, so a temporary frame carries a duplicate of it. assign() returns a
# new frame, which leaves df_cup_m untouched and keeps the cell re-runnable
# (an in-place insert fails the second time because the column already exists).
cup_m_with_dup = df_cup_m.assign(category_dup=df_cup_m['category'])
df_cup_m_agg = cup_m_with_dup.groupby(['full_name'], as_index=False)
df_cup_m_agg = df_cup_m_agg.agg({'full_name': 'first', 'country': 'first', 'year': 'count',
                                 'category': lambda x: (x == 'lead').sum(),
                                 'category_dup': lambda x: (x == 'boulder').sum(),
                                 'height': 'mean', 'rank': 'mean'})
df_cup_m_agg = df_cup_m_agg.sort_values('year', ascending=False)
df_cup_m_agg = df_cup_m_agg.rename(columns={'year': 'times in ranking', 'rank': 'avg. rank',
                                            'category': 'in lead', 'category_dup': 'in boulder'})
df_cup_m_agg.info()
<class 'pandas.core.frame.DataFrame'> Index: 705 entries, 256 to 704 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 full_name 705 non-null object 1 country 705 non-null object 2 times in ranking 705 non-null int64 3 in lead 705 non-null int64 4 in boulder 705 non-null int64 5 height 171 non-null float64 6 avg. rank 705 non-null float64 dtypes: float64(2), int64(3), object(2) memory usage: 44.1+ KB
# The 20 men with the most ranking entries (the frame is sorted by that count)
df_cup_m_agg.head(20)
full_name | country | times in ranking | in lead | in boulder | height | avg. rank | |
---|---|---|---|---|---|---|---|
256 | jakob schubert | AUT | 19 | 10 | 9 | 176.0 | 12.157895 |
550 | sean mccoll | CAN | 18 | 8 | 10 | 169.0 | 23.277778 |
345 | kokoro fujii | JPN | 17 | 7 | 10 | 176.0 | 16.764706 |
150 | domen skofic | SLO | 15 | 9 | 6 | 177.0 | 28.933333 |
292 | jongwon chon | KOR | 15 | 6 | 9 | 177.0 | 29.933333 |
384 | marcello bombardi | ITA | 15 | 10 | 5 | 177.0 | 42.333333 |
424 | michael piccolruaz | ITA | 15 | 5 | 10 | 178.0 | 45.066667 |
222 | hannes puman | SWE | 15 | 10 | 5 | 177.0 | 47.333333 |
637 | tomoa narasaki | JPN | 15 | 6 | 9 | 170.0 | 17.000000 |
2 | adam ondra | CZE | 15 | 10 | 5 | 186.0 | 14.800000 |
271 | jernej kruder | SLO | 14 | 4 | 10 | 179.0 | 35.214286 |
548 | sean bailey | USA | 13 | 7 | 6 | 163.0 | 17.923077 |
672 | yannick flohé | GER | 13 | 7 | 6 | 178.0 | 30.461538 |
546 | sascha lehmann | SUI | 13 | 8 | 5 | 164.0 | 30.384615 |
461 | nicolas collin | BEL | 13 | 8 | 5 | 179.0 | 41.923077 |
678 | yoshiyuki ogata | JPN | 13 | 4 | 9 | 172.0 | 18.230769 |
16 | alex khazanov | ISR | 13 | 4 | 9 | NaN | 58.923077 |
266 | jan hojer | GER | 13 | 5 | 8 | 188.0 | 21.769231 |
571 | simon lorenzi | BEL | 12 | 7 | 5 | 168.0 | 50.500000 |
591 | stefano ghisolfi | ITA | 12 | 10 | 2 | 169.0 | 15.916667 |
# Summary statistics of the per-climber aggregates
df_cup_m_agg.describe()
times in ranking | in lead | in boulder | height | avg. rank | |
---|---|---|---|---|---|
count | 705.000000 | 705.000000 | 705.000000 | 171.000000 | 705.000000 |
mean | 3.100709 | 1.506383 | 1.594326 | 174.157895 | 74.807740 |
std | 3.114722 | 2.024783 | 1.967050 | 6.184771 | 33.989163 |
min | 1.000000 | 0.000000 | 0.000000 | 155.000000 | 1.000000 |
25% | 1.000000 | 0.000000 | 0.000000 | 170.000000 | 50.000000 |
50% | 2.000000 | 1.000000 | 1.000000 | 175.000000 | 73.666667 |
75% | 4.000000 | 2.000000 | 2.000000 | 178.000000 | 97.000000 |
max | 19.000000 | 10.000000 | 10.000000 | 198.000000 | 160.000000 |
# Per country: number of distinct men in the world-cup rankings and their mean height.
df_cup_m_agg_country = df_cup_m_agg.groupby(['country'], as_index=False)
df_cup_m_agg_country = df_cup_m_agg_country.agg({'full_name': 'count', 'height': 'mean'})
df_cup_m_agg_country = df_cup_m_agg_country.sort_values('full_name', ascending=False)
# rename() returns a new frame, so it has to be part of the displayed expression;
# as a separate statement its result was thrown away and the table kept the raw
# 'full_name' / 'height' headers
df_cup_m_agg_country.rename(columns={'full_name': 'no. of climbers', 'height': 'avg. height'}).head(30)
country | full_name | height | |
---|---|---|---|
27 | JPN | 62 | 172.090909 |
15 | FRA | 57 | 176.687500 |
58 | USA | 56 | 175.533333 |
18 | GER | 32 | 176.200000 |
7 | CAN | 29 | 172.750000 |
29 | KOR | 29 | 170.250000 |
47 | RUS | 28 | 174.500000 |
2 | AUT | 28 | 178.428571 |
16 | GBR | 26 | 175.166667 |
51 | SUI | 23 | 169.571429 |
26 | ITA | 23 | 171.625000 |
49 | SLO | 21 | 176.363636 |
9 | CHN | 20 | 173.000000 |
13 | ESP | 17 | 178.250000 |
21 | INA | 15 | 168.500000 |
1 | AUS | 14 | 176.000000 |
23 | IRI | 14 | 173.000000 |
10 | CZE | 12 | 178.333333 |
4 | BEL | 12 | 174.750000 |
35 | MEX | 12 | NaN |
22 | IND | 12 | NaN |
25 | ISR | 11 | 171.250000 |
19 | HKG | 11 | 172.000000 |
39 | NOR | 9 | 175.000000 |
57 | UKR | 9 | 179.000000 |
42 | POL | 9 | 170.500000 |
37 | NED | 9 | 173.500000 |
8 | CHI | 8 | 180.000000 |
48 | SGP | 7 | NaN |
53 | SWE | 7 | 177.000000 |
Analysis of the lead world cup data for men¶
# Men's lead world-cup entries only
is_male = df_cup['sex'] == 'M'
is_lead = df_cup['category'] == 'lead'
df_m_lead = df_cup.loc[is_male & is_lead]
df_m_lead.info()
<class 'pandas.core.frame.DataFrame'> Index: 1062 entries, 0 to 4028 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 1062 non-null int64 1 category 1062 non-null object 2 sex 1062 non-null object 3 full_name 1062 non-null object 4 height 490 non-null float64 5 country 1062 non-null object 6 rank 1062 non-null int64 dtypes: float64(1), int64(2), object(4) memory usage: 66.4+ KB
df_m_lead.describe()
# note that in the numbers below a climber who was ranked in several years
# is counted that many times in the calculations
year | height | rank | |
---|---|---|---|
count | 1062.000000 | 490.000000 | 1062.000000 |
mean | 2018.275895 | 174.181633 | 56.488701 |
std | 3.440920 | 6.616710 | 36.212970 |
min | 2013.000000 | 155.000000 | 1.000000 |
25% | 2015.000000 | 169.250000 | 27.000000 |
50% | 2018.000000 | 175.000000 | 53.500000 |
75% | 2022.000000 | 178.000000 | 80.000000 |
max | 2023.000000 | 198.000000 | 160.000000 |
Analysis of aggregated lead rankings for all years¶
# One row per climber: country, number of yearly lead rankings, mean height and
# mean rank, sorted by the number of rankings (most first).
df_m_lead_agg = (
    df_m_lead.groupby(['full_name'], as_index=False)
    .agg({'full_name': 'first', 'country': 'first', 'year': 'count',
          'height': 'mean', 'rank': 'mean'})
    .sort_values('year', ascending=False)
    .rename(columns={'year': 'years in ranking', 'rank': 'avg. rank'})
)
df_m_lead_agg.info()
<class 'pandas.core.frame.DataFrame'> Index: 431 entries, 0 to 430 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 full_name 431 non-null object 1 country 431 non-null object 2 years in ranking 431 non-null int64 3 height 140 non-null float64 4 avg. rank 431 non-null float64 dtypes: float64(2), int64(1), object(2) memory usage: 20.2+ KB
# The 20 men with the most lead ranking entries (the frame is sorted by that count)
df_m_lead_agg.head(20)
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
0 | adam ondra | CZE | 10 | 186.0 | 17.000000 |
237 | martin bergant | SLO | 10 | 182.0 | 44.200000 |
335 | sebastian halenke | GER | 10 | 177.0 | 19.100000 |
232 | marcello bombardi | ITA | 10 | 177.0 | 24.900000 |
358 | stefano ghisolfi | ITA | 10 | 169.0 | 12.000000 |
131 | hannes puman | SWE | 10 | 177.0 | 34.900000 |
154 | jakob schubert | AUT | 10 | 176.0 | 8.800000 |
86 | domen skofic | SLO | 9 | 177.0 | 8.111111 |
61 | christoph hanke | GER | 9 | 167.0 | 39.222222 |
262 | milan preskar | SLO | 8 | 172.0 | 58.625000 |
330 | sascha lehmann | SUI | 8 | 164.0 | 17.500000 |
133 | hanwool kim | KOR | 8 | NaN | 34.750000 |
81 | dimitri vogt | SUI | 8 | 158.0 | 75.375000 |
283 | nicolas collin | BEL | 8 | 179.0 | 42.875000 |
334 | sean mccoll | CAN | 8 | 169.0 | 15.875000 |
244 | masahiro higuchi | JPN | 8 | 169.0 | 14.375000 |
250 | max rudigier | AUT | 7 | NaN | 35.285714 |
273 | nao monchois | FRA | 7 | NaN | 52.000000 |
209 | kokoro fujii | JPN | 7 | 176.0 | 24.285714 |
246 | mathias posch | AUT | 7 | 171.0 | 59.142857 |
# Summary statistics of the per-climber lead aggregates
df_m_lead_agg.describe()
years in ranking | height | avg. rank | |
---|---|---|---|
count | 431.000000 | 140.000000 | 431.000000 |
mean | 2.464037 | 173.964286 | 71.940380 |
std | 2.084936 | 6.441112 | 35.360244 |
min | 1.000000 | 155.000000 | 1.000000 |
25% | 1.000000 | 170.000000 | 47.750000 |
50% | 2.000000 | 175.000000 | 69.000000 |
75% | 3.000000 | 178.000000 | 90.750000 |
max | 10.000000 | 198.000000 | 160.000000 |
# Same summary restricted to climbers with at least 5 entries in the lead rankings
df_m_lead_agg.loc[df_m_lead_agg['years in ranking'] >= 5].describe()
years in ranking | height | avg. rank | |
---|---|---|---|
count | 63.000000 | 38.000000 | 63.000000 |
mean | 6.746032 | 173.605263 | 36.988946 |
std | 1.585908 | 6.499644 | 18.582166 |
min | 5.000000 | 158.000000 | 6.857143 |
25% | 5.000000 | 169.000000 | 20.557143 |
50% | 7.000000 | 174.000000 | 37.142857 |
75% | 7.500000 | 177.750000 | 51.700000 |
max | 10.000000 | 188.000000 | 75.375000 |
# Lead climbers taller than 186 cm
df_m_lead_agg.loc[df_m_lead_agg['height'] > 186]
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
159 | jan hojer | GER | 5 | 188.0 | 42.40 |
295 | paul jenft | FRA | 4 | 198.0 | 40.25 |
253 | meichi narasaki | JPN | 4 | 188.0 | 36.50 |
215 | louis gundolf | AUT | 4 | 188.0 | 76.75 |
62 | christoph schweiger | GER | 1 | 187.0 | 117.00 |
# Lead climbers shorter than 163 cm
df_m_lead_agg.loc[df_m_lead_agg['height'] < 163]
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
81 | dimitri vogt | SUI | 8 | 158.0 | 75.375 |
146 | hyunbin min | KOR | 4 | 162.0 | 23.000 |
121 | giovanni placci | ITA | 2 | 155.0 | 42.000 |
346 | shion omata | JPN | 1 | 162.0 | 4.000 |
400 | veddriq leonardo | INA | 1 | 162.0 | 84.000 |
# Histogram of the heights of all men in the lead rankings (one entry per
# climber) with the mean (solid line) and mean +- one standard deviation
# (dashed lines) marked.
heights_m_lead = df_m_lead_agg['height'].dropna()
mean = heights_m_lead.mean()
std = heights_m_lead.std()
# One bin per centimetre. The edges must run up to max+1: the last histogram
# bin is closed on both sides, so stopping the edges at max would merge the
# counts for max-1 and max into a single bar.
bins = np.arange(np.floor(heights_m_lead.min()), heights_m_lead.max() + 2, 1)
plt.hist(heights_m_lead, bins=bins, color='deepskyblue', zorder=10, alpha=0.9)
plt.axvline(mean, color='black', lw=0.9)
plt.axvline(mean - std, linestyle='--', color='black', lw=0.9)
plt.axvline(mean + std, linestyle='--', color='black', lw=0.9)
ax = plt.gca()
ax.set_xticks(bins[::3])
# the backslash is doubled because '\p' is not a valid string escape
plt.text(0.61, 0.93, f'Avg. height is {round(mean, 1)} $\\pm$ {round(std, 1)}', transform=ax.transAxes)
plt.title('Height distribution of male climbers registered in\nthe IFSC '
          'Lead World Cup ranking between 2013 and 2023')
plt.xlabel('height [cm]')
plt.ylabel('number of climbers')
plt.show()
# Keep known heights strictly between 156 and 195 cm before testing
# NOTE(review): the bounds presumably drop the 198 cm and 155 cm entries as outliers
lead_has_height = df_m_lead_agg['height'].notnull()
lead_below_upper = df_m_lead_agg['height'] < 195
lead_above_lower = df_m_lead_agg['height'] > 156
df_m_lead_agg_clean = df_m_lead_agg.loc[lead_has_height & lead_below_upper & lead_above_lower]
stats.normaltest(df_m_lead_agg_clean['height'])
NormaltestResult(statistic=0.064157677419489, pvalue=0.968430229279705)
# Kolmogorov-Smirnov test of the lead climbers' heights against the general-population normal distribution
stats.ks_1samp(df_m_lead_agg_clean['height'], stats.norm.cdf, args=(world_m_height_avg, world_m_height_std))
KstestResult(statistic=0.31086997037047653, pvalue=2.4086640889450246e-12, statistic_location=178.0, statistic_sign=1)
# One-sample t-test of the lead climbers' mean height against the general-population mean
stats.ttest_1samp(df_m_lead_agg_clean['height'], world_m_height_avg)
TtestResult(statistic=-8.849233874164154, pvalue=3.960169063117888e-15, df=137)
# Per country: number of distinct men in the lead rankings and their mean height.
df_m_lead_agg_country = (
    df_m_lead_agg.groupby(['country'], as_index=False)
    .agg({'full_name': 'count', 'height': 'mean'})
    .sort_values('full_name', ascending=False)
)
# readable headers for display only; the variable keeps the raw column names
df_m_lead_agg_country.rename(columns={'full_name': 'no. of climbers', 'height': 'avg. height'})
country | no. of climbers | avg. height | |
---|---|---|---|
23 | JPN | 40 | 172.833333 |
12 | FRA | 39 | 176.307692 |
25 | KOR | 25 | 168.333333 |
47 | USA | 24 | 174.833333 |
37 | RUS | 23 | 174.500000 |
39 | SLO | 21 | 176.363636 |
2 | AUT | 18 | 178.428571 |
22 | ITA | 17 | 172.000000 |
14 | GER | 16 | 177.000000 |
8 | CHN | 16 | 173.000000 |
13 | GBR | 15 | 175.166667 |
17 | INA | 15 | 168.500000 |
41 | SUI | 13 | 167.800000 |
6 | CAN | 11 | 172.750000 |
10 | ESP | 11 | 181.000000 |
15 | HKG | 9 | 172.000000 |
3 | BEL | 9 | 174.750000 |
9 | CZE | 9 | 182.000000 |
30 | NOR | 8 | 175.000000 |
7 | CHI | 7 | 180.000000 |
21 | ISR | 6 | 168.333333 |
1 | AUS | 6 | 176.000000 |
27 | MEX | 5 | NaN |
33 | POL | 5 | 170.500000 |
44 | THA | 5 | NaN |
28 | NED | 5 | 165.000000 |
18 | IND | 5 | NaN |
46 | UKR | 5 | NaN |
19 | IRI | 4 | NaN |
43 | SWE | 4 | 177.000000 |
4 | BRA | 4 | NaN |
16 | HUN | 3 | 176.000000 |
42 | SVK | 3 | 175.000000 |
5 | BUL | 3 | 172.000000 |
45 | TPE | 2 | NaN |
0 | ARG | 2 | NaN |
48 | VEN | 2 | NaN |
36 | RSA | 2 | 174.000000 |
34 | POR | 2 | NaN |
31 | PAK | 2 | 170.000000 |
29 | NEP | 2 | NaN |
38 | SGP | 1 | NaN |
35 | ROU | 1 | NaN |
40 | SRB | 1 | 175.000000 |
32 | PER | 1 | NaN |
26 | LUX | 1 | NaN |
20 | IRL | 1 | 174.000000 |
11 | FIN | 1 | 180.000000 |
24 | KAZ | 1 | 170.000000 |
Changes in the statistics over the years¶
# Ranking years to analyse; 2020 is left out (presumably there is no ranking
# for that year in the data -- confirm against the download step).
years = [year for year in range(2013, 2024) if year != 2020]
l_height_list = []      # mean height per year (pandas skips NaN)
l_height_std_list = []  # standard deviation of height per year
for year in years:
    # Select rows and column in a single .loc call instead of chained indexing.
    heights = df_m_lead.loc[df_m_lead['year'] == year, 'height']
    l_height_list.append(heights.mean())
    l_height_std_list.append(heights.std())
    avg = round(l_height_list[-1], 1)
    std = round(l_height_std_list[-1], 1)
    all_ = len(heights)
    nans = int(heights.isna().sum())
    # A year without any entries would otherwise raise ZeroDivisionError.
    perc = round(100.0*nans/all_, 1) if all_ else float('nan')
    print(f'Avg. height {year}: {avg} +- {std} '+
          f'({all_-nans} records, {nans} NaN, {perc}% of NaNs in {all_} climbers)')
Avg. height 2013: 173.8 +- 6.2 (15 records, 82 NaN, 84.5% of NaNs in 97 climbers)
Avg. height 2014: 172.6 +- 6.3 (23 records, 82 NaN, 78.1% of NaNs in 105 climbers)
Avg. height 2015: 173.4 +- 7.4 (23 records, 66 NaN, 74.2% of NaNs in 89 climbers)
Avg. height 2016: 173.2 +- 6.8 (28 records, 64 NaN, 69.6% of NaNs in 92 climbers)
Avg. height 2017: 173.6 +- 6.7 (39 records, 63 NaN, 61.8% of NaNs in 102 climbers)
Avg. height 2018: 173.6 +- 6.8 (47 records, 47 NaN, 50.0% of NaNs in 94 climbers)
Avg. height 2019: 175.1 +- 6.9 (56 records, 27 NaN, 32.5% of NaNs in 83 climbers)
Avg. height 2021: 175.1 +- 6.6 (68 records, 18 NaN, 20.9% of NaNs in 86 climbers)
Avg. height 2022: 174.5 +- 6.4 (97 records, 63 NaN, 39.4% of NaNs in 160 climbers)
Avg. height 2023: 174.2 +- 6.5 (94 records, 60 NaN, 39.0% of NaNs in 154 climbers)
# For every year, compare the mean rank of all climbers with the mean rank of
# those lacking a height and of those having one.
for year in years:
    df_y = df_m_lead[df_m_lead['year'] == year]
    has_height = df_y['height'].notnull()
    avg_rank = round(df_y['rank'].mean())
    avg_rank_nan = round(df_y.loc[~has_height, 'rank'].mean())
    avg_rank_not_nan = round(df_y.loc[has_height, 'rank'].mean())
    print(f'{year} avg. rank: {avg_rank}; avg. rank of climbers w/out height {avg_rank_nan},'+
          f' and with height data {avg_rank_not_nan}')
2013 avg. rank: 49; avg. rank of climbers w/out height 52, and with height data 32
2014 avg. rank: 53; avg. rank of climbers w/out height 55, and with height data 43
2015 avg. rank: 45; avg. rank of climbers w/out height 48, and with height data 37
2016 avg. rank: 46; avg. rank of climbers w/out height 50, and with height data 39
2017 avg. rank: 51; avg. rank of climbers w/out height 58, and with height data 40
2018 avg. rank: 47; avg. rank of climbers w/out height 51, and with height data 44
2019 avg. rank: 42; avg. rank of climbers w/out height 48, and with height data 39
2021 avg. rank: 43; avg. rank of climbers w/out height 57, and with height data 40
2022 avg. rank: 80; avg. rank of climbers w/out height 112, and with height data 60
2023 avg. rank: 77; avg. rank of climbers w/out height 110, and with height data 55
# Box plot of the lead heights grouped by ranking year. Outliers are drawn as
# hollow orange-red circles and the yearly mean as a small grey cross.
outlier_marker = dict(markerfacecolor='none', marker='o', markeredgecolor='orangered', alpha=0.7)
meanprops = dict(markerfacecolor='none', marker='x', markeredgecolor='gray', alpha=0.9, markersize=4)
df_m_lead.boxplot(column='height', by='year', flierprops=outlier_marker, showmeans=True, meanprops=meanprops)
ax = plt.gca()
ax.grid(linestyle='--', alpha=0.5)
# Switch the vertical grid lines off explicitly. A bare ax.grid(axis='x')
# merely toggles the current state, so its effect depends on what ran before.
ax.grid(False, axis='x')
ax.set_yticks(np.arange(155, 200, 5))
# Clear the figure-level title that pandas adds for grouped box plots.
plt.suptitle('')
plt.title('Box plot of height distribution of male climbers registered\n'+
          'in the IFSC Lead World Cup ranking for different years')
plt.xlabel('year')
plt.ylabel('height [cm]')
plt.show()
Analysis of the bouldering world cup data for men¶
# All men's bouldering entries of the World Cup rankings (one row per climber
# and year), followed by a summary of the resulting frame.
df_m_boulder = df_cup[df_cup['sex'].eq('M') & df_cup['category'].eq('boulder')]
df_m_boulder.info()
<class 'pandas.core.frame.DataFrame'> Index: 1124 entries, 160 to 4143 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 1124 non-null int64 1 category 1124 non-null object 2 sex 1124 non-null object 3 full_name 1124 non-null object 4 height 431 non-null float64 5 country 1124 non-null object 6 rank 1124 non-null int64 dtypes: float64(1), int64(2), object(4) memory usage: 70.2+ KB
df_m_boulder.describe()
# note: the numbers below are computed over ranking entries, so a climber who
# was ranked in several years is counted once for each of those years
year | height | rank | |
---|---|---|---|
count | 1124.000000 | 431.000000 | 1124.000000 |
mean | 2017.981317 | 175.345708 | 59.982206 |
std | 3.476240 | 5.775145 | 37.950331 |
min | 2013.000000 | 162.000000 | 1.000000 |
25% | 2015.000000 | 172.000000 | 29.000000 |
50% | 2017.000000 | 176.000000 | 56.000000 |
75% | 2022.000000 | 178.000000 | 85.250000 |
max | 2023.000000 | 198.000000 | 149.000000 |
Analysis of aggregated bouldering rankings for all years¶
# Collapse the yearly bouldering entries to one row per climber: country,
# number of years ranked, mean height and mean rank. Climbers with the most
# ranked years come first.
df_m_boulder_agg = (
    df_m_boulder
    .groupby(['full_name'], as_index=False)
    .agg({
        'full_name': 'first',
        'country': 'first',
        'year': 'count',
        'height': 'mean',
        'rank': 'mean',
    })
    .sort_values('year', ascending=False)
    .rename(columns={'year': 'years in ranking', 'rank': 'avg. rank'})
)
df_m_boulder_agg.info()
<class 'pandas.core.frame.DataFrame'> Index: 482 entries, 232 to 481 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 full_name 482 non-null object 1 country 482 non-null object 2 years in ranking 482 non-null int64 3 height 127 non-null float64 4 avg. rank 482 non-null float64 dtypes: float64(2), int64(1), object(2) memory usage: 22.6+ KB
df_m_boulder_agg.head(20)
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
232 | kokoro fujii | JPN | 10 | 176.0 | 11.500000 |
287 | michael piccolruaz | ITA | 10 | 178.0 | 36.400000 |
378 | sean mccoll | CAN | 10 | 169.0 | 29.200000 |
181 | jernej kruder | SLO | 10 | 179.0 | 18.200000 |
432 | tomoa narasaki | JPN | 9 | 170.0 | 8.000000 |
169 | jakob schubert | AUT | 9 | 176.0 | 15.888889 |
289 | mickael mawem | FRA | 9 | 179.0 | 27.333333 |
462 | yoshiyuki ogata | JPN | 9 | 172.0 | 19.333333 |
198 | jongwon chon | KOR | 9 | 177.0 | 7.666667 |
11 | alex khazanov | ISR | 9 | NaN | 40.111111 |
305 | nathaniel coleman | USA | 9 | 182.0 | 48.444444 |
349 | rei sugimoto | JPN | 9 | 172.0 | 24.333333 |
265 | martin stranik | CZE | 8 | 178.0 | 50.250000 |
433 | tomoaki takata | JPN | 8 | 175.0 | 29.250000 |
304 | nathan phillips | GBR | 8 | NaN | 42.375000 |
255 | manuel cornu | FRA | 8 | 177.0 | 20.000000 |
177 | jan hojer | GER | 8 | 188.0 | 8.875000 |
140 | gregor vezonik | SLO | 8 | 176.0 | 38.875000 |
382 | sergii topishko | UKR | 8 | NaN | 47.375000 |
223 | kevin heiniger | SUI | 7 | NaN | 71.714286 |
df_m_boulder_agg.describe()
years in ranking | height | avg. rank | |
---|---|---|---|
count | 482.000000 | 127.000000 | 482.000000 |
mean | 2.331950 | 174.574803 | 75.544836 |
std | 1.984811 | 5.902004 | 36.109175 |
min | 1.000000 | 162.000000 | 1.000000 |
25% | 1.000000 | 170.000000 | 48.000000 |
50% | 1.000000 | 175.000000 | 74.000000 |
75% | 3.000000 | 178.000000 | 103.750000 |
max | 10.000000 | 198.000000 | 149.000000 |
df_m_boulder_agg.loc[df_m_boulder_agg['years in ranking'] >= 5].describe()
years in ranking | height | avg. rank | |
---|---|---|---|
count | 67.000000 | 37.000000 | 67.000000 |
mean | 6.537313 | 175.648649 | 40.344942 |
std | 1.636037 | 5.735931 | 22.389098 |
min | 5.000000 | 163.000000 | 7.666667 |
25% | 5.000000 | 172.000000 | 23.250000 |
50% | 6.000000 | 176.000000 | 38.285714 |
75% | 8.000000 | 178.000000 | 53.600000 |
max | 10.000000 | 188.000000 | 100.500000 |
df_m_boulder_agg.loc[df_m_boulder_agg['height'] > 185]
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
177 | jan hojer | GER | 8 | 188.0 | 8.875000 |
283 | meichi narasaki | JPN | 6 | 188.0 | 27.166667 |
2 | adam ondra | CZE | 5 | 186.0 | 10.400000 |
327 | paul jenft | FRA | 3 | 198.0 | 34.000000 |
75 | christoph schweiger | GER | 3 | 187.0 | 50.666667 |
df_m_boulder_agg.loc[df_m_boulder_agg['height'] < 165]
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
377 | sean bailey | USA | 6 | 163.0 | 22.333333 |
375 | sascha lehmann | SUI | 5 | 164.0 | 51.000000 |
388 | shion omata | JPN | 1 | 162.0 | 113.000000 |
354 | ritsu kayotani | JPN | 1 | 163.0 | 12.000000 |
96 | dillon countryman | USA | 1 | 164.0 | 41.000000 |
# Histogram of the per-climber heights in the aggregated men's bouldering
# ranking, with the mean (solid line) and mean +/- one standard deviation
# (dashed lines) marked.
mean = df_m_boulder_agg['height'].mean()
std = df_m_boulder_agg['height'].std()
# One bin per centimetre. The stop value is max+2 so the tallest height gets
# its own [max, max+1] bin: the last histogram bin is closed on both sides,
# so with max+1 the heights max-1 and max would be merged into a single bar.
bins = np.arange(np.floor(df_m_boulder_agg['height'].min()), df_m_boulder_agg['height'].max()+2, 1)
plt.hist(df_m_boulder_agg['height'], bins=bins, color='deepskyblue', zorder=10, alpha=0.9)
plt.axvline(mean, color='black', lw=0.9)
plt.axvline(mean-std, linestyle='--', color='black', lw=0.9)
plt.axvline(mean+std, linestyle='--', color='black', lw=0.9)
ax = plt.gca()
ax.set_xticks(bins[::3])
# '\\pm' spells the mathtext command without the invalid '\p' escape sequence.
plt.text(0.61, 0.93, f'Avg. height is {round(mean, 1)} $\\pm$ {round(std, 1)}', transform=ax.transAxes)
plt.title('Height distribution of male climbers registered in the IFSC\n'+
          'Bouldering World Cup ranking between 2013 and 2023')
plt.xlabel('height [cm]')
plt.ylabel('number of climbers')
plt.show()
# Keep only climbers with a recorded height below 195 cm (presumably to drop
# the single very tall entry -- confirm), then test the rest for normality.
df_m_boulder_agg_clean = df_m_boulder_agg[
    df_m_boulder_agg['height'].notna()
    & df_m_boulder_agg['height'].lt(195)
]
stats.normaltest(df_m_boulder_agg_clean['height'])
NormaltestResult(statistic=0.6821195834831839, pvalue=0.7110163940269245)
stats.ks_1samp(df_m_boulder_agg_clean['height'], stats.norm.cdf, args=(world_m_height_avg, world_m_height_std))
KstestResult(statistic=0.29879267568448625, pvalue=1.8028202950648584e-10, statistic_location=178.0, statistic_sign=1)
stats.ttest_1samp(df_m_boulder_agg_clean['height'], world_m_height_avg)
TtestResult(statistic=-8.127721561300902, pvalue=3.6372076149100477e-13, df=125)
# Per-country summary of the aggregated bouldering ranking: number of
# climbers and their mean height, ordered by number of climbers (descending).
df_m_boulder_agg_country = (
    df_m_boulder_agg
    .groupby(['country'], as_index=False)
    .agg({'full_name': 'count', 'height': 'mean'})
    .sort_values('full_name', ascending=False)
)
# The renamed frame is only displayed; the stored frame keeps the raw names.
df_m_boulder_agg_country.rename(columns={'full_name': 'no. of climbers', 'height': 'avg. height'})
country | no. of climbers | avg. height | |
---|---|---|---|
54 | USA | 47 | 174.666667 |
27 | JPN | 44 | 171.555556 |
15 | FRA | 27 | 177.750000 |
18 | GER | 26 | 177.250000 |
7 | CAN | 26 | 172.750000 |
16 | GBR | 23 | 175.166667 |
2 | AUT | 20 | 178.000000 |
29 | KOR | 18 | 172.000000 |
47 | SUI | 16 | 171.800000 |
44 | RUS | 13 | 174.500000 |
9 | CHN | 12 | 173.000000 |
26 | ITA | 12 | 172.600000 |
23 | IRI | 12 | 173.000000 |
46 | SLO | 11 | 177.250000 |
13 | ESP | 11 | 178.250000 |
35 | MEX | 11 | NaN |
1 | AUS | 10 | 165.000000 |
25 | ISR | 9 | 171.250000 |
22 | IND | 9 | NaN |
37 | NED | 8 | 182.000000 |
4 | BEL | 7 | 174.000000 |
45 | SGP | 7 | NaN |
53 | UKR | 7 | 179.000000 |
8 | CHI | 6 | 180.000000 |
10 | CZE | 6 | 178.333333 |
41 | POL | 6 | 175.000000 |
19 | HKG | 6 | 172.000000 |
49 | SWE | 5 | 177.000000 |
14 | FIN | 5 | 180.000000 |
3 | AZE | 4 | NaN |
50 | THA | 4 | NaN |
31 | LTU | 4 | 176.000000 |
24 | IRL | 4 | 173.000000 |
5 | BRA | 4 | NaN |
17 | GEO | 4 | NaN |
52 | TUR | 3 | NaN |
39 | NOR | 3 | 175.000000 |
38 | NEP | 3 | NaN |
28 | KAZ | 3 | 170.000000 |
34 | MAS | 3 | NaN |
30 | LAT | 3 | 181.000000 |
20 | HUN | 3 | 176.000000 |
11 | DEN | 3 | NaN |
43 | RSA | 2 | 178.000000 |
6 | BUL | 2 | 172.000000 |
36 | MRI | 1 | 176.000000 |
40 | PER | 1 | NaN |
42 | PUR | 1 | NaN |
33 | MAC | 1 | NaN |
32 | LUX | 1 | NaN |
21 | INA | 1 | NaN |
48 | SVK | 1 | 175.000000 |
12 | ECU | 1 | 175.000000 |
51 | TPE | 1 | NaN |
0 | ARG | 1 | NaN |
Changes in the statistics over the years¶
# Ranking years to analyse; 2020 is left out (presumably there is no ranking
# for that year in the data -- confirm against the download step).
years = [year for year in range(2013, 2024) if year != 2020]
b_height_list = []      # mean height per year (pandas skips NaN)
b_height_std_list = []  # standard deviation of height per year
for year in years:
    # Select rows and column in a single .loc call instead of chained indexing.
    heights = df_m_boulder.loc[df_m_boulder['year'] == year, 'height']
    b_height_list.append(heights.mean())
    b_height_std_list.append(heights.std())
    avg = round(b_height_list[-1], 1)
    std = round(b_height_std_list[-1], 1)
    all_ = len(heights)
    nans = int(heights.isna().sum())
    # A year without any entries would otherwise raise ZeroDivisionError.
    perc = round(100.0*nans/all_, 1) if all_ else float('nan')
    print(f'Avg. height in {year}: {avg} +- {std} '+
          f'({all_-nans} records, {nans} NaN, {perc}% of NaNs in {all_} climbers)')
Avg. height in 2013: 176.3 +- 5.1 (13 records, 102 NaN, 88.7% of NaNs in 115 climbers)
Avg. height in 2014: 175.4 +- 5.6 (21 records, 118 NaN, 84.9% of NaNs in 139 climbers)
Avg. height in 2015: 176.0 +- 5.3 (21 records, 61 NaN, 74.4% of NaNs in 82 climbers)
Avg. height in 2016: 176.1 +- 5.3 (28 records, 91 NaN, 76.5% of NaNs in 119 climbers)
Avg. height in 2017: 175.5 +- 5.4 (34 records, 77 NaN, 69.4% of NaNs in 111 climbers)
Avg. height in 2018: 175.4 +- 5.5 (37 records, 53 NaN, 58.9% of NaNs in 90 climbers)
Avg. height in 2019: 175.8 +- 5.8 (46 records, 39 NaN, 45.9% of NaNs in 85 climbers)
Avg. height in 2021: 175.3 +- 6.5 (54 records, 17 NaN, 23.9% of NaNs in 71 climbers)
Avg. height in 2022: 175.1 +- 5.8 (83 records, 70 NaN, 45.8% of NaNs in 153 climbers)
Avg. height in 2023: 174.7 +- 6.1 (94 records, 65 NaN, 40.9% of NaNs in 159 climbers)
# For every year, compare the mean rank of all climbers with the mean rank of
# those lacking a height and of those having one.
for year in years:
    df_y = df_m_boulder[df_m_boulder['year'] == year]
    has_height = df_y['height'].notnull()
    avg_rank = round(df_y['rank'].mean())
    avg_rank_nan = round(df_y.loc[~has_height, 'rank'].mean())
    avg_rank_not_nan = round(df_y.loc[has_height, 'rank'].mean())
    print(f'{year} avg. rank: {avg_rank}; avg. rank of climbers w/out height {avg_rank_nan},'+
          f' and with height data {avg_rank_not_nan}')
2013 avg. rank: 58; avg. rank of climbers w/out height 60, and with height data 39
2014 avg. rank: 70; avg. rank of climbers w/out height 75, and with height data 38
2015 avg. rank: 41; avg. rank of climbers w/out height 48, and with height data 22
2016 avg. rank: 59; avg. rank of climbers w/out height 66, and with height data 39
2017 avg. rank: 55; avg. rank of climbers w/out height 63, and with height data 38
2018 avg. rank: 45; avg. rank of climbers w/out height 53, and with height data 33
2019 avg. rank: 43; avg. rank of climbers w/out height 51, and with height data 36
2021 avg. rank: 36; avg. rank of climbers w/out height 53, and with height data 30
2022 avg. rank: 77; avg. rank of climbers w/out height 97, and with height data 59
2023 avg. rank: 79; avg. rank of climbers w/out height 108, and with height data 60
# Box plot of the bouldering heights grouped by ranking year. Outliers are
# drawn as hollow orange-red circles and the yearly mean as a small grey cross.
outlier_marker = dict(markerfacecolor='none', marker='o', markeredgecolor='orangered', alpha=0.7)
meanprops = dict(markerfacecolor='none', marker='x', markeredgecolor='gray', alpha=0.9, markersize=4)
df_m_boulder.boxplot(column='height', by='year', flierprops=outlier_marker, showmeans=True, meanprops=meanprops)
ax = plt.gca()
ax.grid(linestyle='--', alpha=0.5)
# Switch the vertical grid lines off explicitly. A bare ax.grid(axis='x')
# merely toggles the current state, so its effect depends on what ran before.
ax.grid(False, axis='x')
ax.set_yticks(np.arange(160, 200, 5))
# Clear the figure-level title that pandas adds for grouped box plots.
plt.suptitle('')
plt.title('Box plot of height distribution of male climbers registered\n'+
          'in the IFSC Bouldering World Cup ranking for different years')
plt.xlabel('year')
plt.ylabel('height [cm]')
plt.show()
Differences between lead and boulder for men¶
# Overlay the height histograms of the aggregated lead (green) and bouldering
# (purple) rankings and mark each mean with a vertical line.
mean = df_m_lead_agg['height'].mean()
std = df_m_lead_agg['height'].std()
# One bin per centimetre. The stop value is max+2 so the tallest height gets
# its own [max, max+1] bin: the last histogram bin is closed on both sides,
# so with max+1 the heights max-1 and max would be merged into a single bar.
bins = np.arange(np.floor(df_m_lead_agg['height'].min()), df_m_lead_agg['height'].max()+2, 1)
plt.hist(df_m_lead_agg['height'], bins=bins, color='green', zorder=10, alpha=0.5, label='lead')
plt.axvline(mean, color='black', lw=0.95, linestyle='-.')
mean_b = df_m_boulder_agg['height'].mean()
std_b = df_m_boulder_agg['height'].std()
bins_b = np.arange(np.floor(df_m_boulder_agg['height'].min()), df_m_boulder_agg['height'].max()+2, 1)
plt.hist(df_m_boulder_agg['height'], bins=bins_b, color='darkorchid', zorder=10, alpha=0.5, label='boulder')
plt.axvline(mean_b, color='black', lw=0.95, linestyle='--')
plt.legend(loc=2)
ax = plt.gca()
# The x ticks follow the lead bins only.
ax.set_xticks(bins[::3])
# '\\pm' spells the mathtext command without the invalid '\p' escape sequence
# (a raw string is not an option here because of the '\n' line breaks).
plt.text(0.64, 0.82, f'Avg. height is:\n{round(mean, 1)} $\\pm$ {round(std, 1)} in lead\n'+
         f'{round(mean_b, 1)} $\\pm$ {round(std_b, 1)} in boulder', transform=ax.transAxes)
plt.title('Height distribution of male climbers registered in\nthe IFSC World Cup rankings between 2013 and 2023')
plt.xlabel('height [cm]')
plt.ylabel('number of climbers')
plt.show()
# Yearly mean height per discipline (line) with a band of one standard
# deviation on either side (shaded area).
b_height_list = np.array(b_height_list)
b_height_std_list = np.array(b_height_std_list)
l_height_list = np.array(l_height_list)
l_height_std_list = np.array(l_height_std_list)
series = (
    ('boulder', 'darkorchid', b_height_list, b_height_std_list),
    ('lead', 'green', l_height_list, l_height_std_list),
)
for label, colour, avgs, spreads in series:
    plt.plot(years, avgs, color=colour, lw=1.8, label=label)
    plt.fill_between(years, avgs - spreads, avgs + spreads,
                     color=colour, lw=0, alpha=0.2)
plt.xlim([min(years), max(years)])
plt.ylim([164, 185.5])
plt.legend()
ax = plt.gca()
ax.set_xticks(years)
ax.set_yticks(np.arange(165, 186, 2))
ax.set_title('Comparison of the avg. height of male climbers\nin lead and bouldering IFSC rankings over the years')
ax.set_xlabel('year')
ax.set_ylabel('height [cm]')
plt.show()
Jump in height from 2018 to 2019 for lead¶
# Lead entries from 2018 and 2019 with a known height, collapsed to one row
# per climber. The summed 'year' works as a membership flag: it equals 2018 or
# 2019 for a climber present in only that year (assuming at most one row per
# climber and year -- confirm) and 4037 for a climber present in both.
df_m_lead_18_19 = (
    df_m_lead[df_m_lead['year'].isin([2018, 2019]) & df_m_lead['height'].notnull()]
    .groupby(['full_name'], as_index=False)
    .agg({'full_name': 'first', 'country': 'first', 'year': 'sum', 'height': 'mean'})
)
df_m_lead_18 = df_m_lead_18_19[df_m_lead_18_19['year'].eq(2018)]
df_m_lead_19 = df_m_lead_18_19[df_m_lead_18_19['year'].eq(2019)]
print(f'Avg. height of climbers competeing in 2018 but not in 2019: {df_m_lead_18["height"].mean()}')
df_m_lead_18.sort_values('height')
Avg. height of climbers competeing in 2018 but not in 2019: 172.45454545454547
full_name | country | year | height | |
---|---|---|---|---|
17 | hyunbin min | KOR | 2018 | 162.0 |
61 | veddriq leonardo | INA | 2018 | 162.0 |
32 | marcin dzienski | POL | 2018 | 166.0 |
21 | jeremy bonder | FRA | 2018 | 168.0 |
35 | masahiro higuchi | JPN | 2018 | 169.0 |
63 | yoshiyuki ogata | JPN | 2018 | 172.0 |
37 | max kleesattel | GER | 2018 | 173.0 |
49 | romaric geffroy | FRA | 2018 | 177.0 |
24 | john brosler | USA | 2018 | 179.0 |
6 | arsène duval | FRA | 2018 | 181.0 |
29 | louis gundolf | AUT | 2018 | 188.0 |
print(f'Avg. height of climbers competeing in 2019 but not in 2018: {df_m_lead_19["height"].mean()}')
df_m_lead_19.sort_values('height')
Avg. height of climbers competeing in 2019 but not in 2018: 177.35
full_name | country | year | height | |
---|---|---|---|---|
45 | nimrod marcus | ISR | 2019 | 166.0 |
66 | zach galla | USA | 2019 | 168.0 |
57 | sungsu lee | KOR | 2019 | 170.0 |
10 | dmitrii fakirianov | RUS | 2019 | 171.0 |
48 | rei sugimoto | JPN | 2019 | 172.0 |
64 | yufei pan | CHN | 2019 | 173.0 |
47 | philipp martin | GER | 2019 | 173.0 |
11 | dohyun lee | KOR | 2019 | 174.0 |
60 | tomoaki takata | JPN | 2019 | 175.0 |
5 | anze peharc | SLO | 2019 | 177.0 |
25 | jongwon chon | KOR | 2019 | 177.0 |
34 | martin stranik | CZE | 2019 | 178.0 |
40 | mickael mawem | FRA | 2019 | 179.0 |
23 | jesse grupper | USA | 2019 | 180.0 |
4 | alistair duval | FRA | 2019 | 180.0 |
7 | campbell harrison | AUS | 2019 | 182.0 |
43 | nathaniel coleman | USA | 2019 | 182.0 |
41 | mikel asier linacisoro molina | ESP | 2019 | 184.0 |
38 | meichi narasaki | JPN | 2019 | 188.0 |
46 | paul jenft | FRA | 2019 | 198.0 |
Jump in height from 2018 to 2019 for bouldering¶
# Bouldering entries from 2018 and 2019 with a known height, collapsed to one
# row per climber. The summed 'year' works as a membership flag: it equals
# 2018 or 2019 for a climber present in only that year (assuming at most one
# row per climber and year -- confirm) and 4037 for a climber present in both.
df_m_boulder_18_19 = (
    df_m_boulder[df_m_boulder['year'].isin([2018, 2019]) & df_m_boulder['height'].notnull()]
    .groupby(['full_name'], as_index=False)
    .agg({'full_name': 'first', 'country': 'first', 'year': 'sum', 'height': 'mean'})
)
df_m_boulder_18 = df_m_boulder_18_19[df_m_boulder_18_19['year'].eq(2018)]
df_m_boulder_19 = df_m_boulder_18_19[df_m_boulder_18_19['year'].eq(2019)]
print(f'Avg. height of climbers competeing in 2018 but not in 2019: {df_m_boulder_18["height"].mean()}')
df_m_boulder_18.sort_values('height')
Avg. height of climbers competeing in 2018 but not in 2019: 173.0
full_name | country | year | height | |
---|---|---|---|---|
32 | nimrod marcus | ISR | 2018 | 166.0 |
22 | masahiro higuchi | JPN | 2018 | 169.0 |
42 | thilo jeldrik schröter | NOR | 2018 | 175.0 |
31 | nils favre | SUI | 2018 | 176.0 |
49 | zan lovenjak sudar | SLO | 2018 | 179.0 |
print(f'Avg. height of climbers competeing in 2019 but not in 2018: {df_m_boulder_19["height"].mean()}')
df_m_boulder_19.sort_values('height')
Avg. height of climbers competeing in 2019 but not in 2018: 175.92857142857142
full_name | country | year | height | |
---|---|---|---|---|
40 | simon lorenzi | BEL | 2019 | 168.0 |
48 | zach galla | USA | 2019 | 168.0 |
41 | stefano ghisolfi | ITA | 2019 | 169.0 |
34 | ram levin | ISR | 2019 | 171.0 |
35 | rei kawamata | JPN | 2019 | 172.0 |
29 | nicolai uznik | AUT | 2019 | 173.0 |
33 | philipp martin | GER | 2019 | 173.0 |
5 | carlos felipe granja lopez | ECU | 2019 | 175.0 |
18 | luka potocar | SLO | 2019 | 177.0 |
1 | alberto ginés lópez | ESP | 2019 | 178.0 |
50 | zander waller | USA | 2019 | 182.0 |
27 | mikel asier linacisoro molina | ESP | 2019 | 184.0 |
0 | adam ondra | CZE | 2019 | 186.0 |
7 | christoph schweiger | GER | 2019 | 187.0 |
Basic statistics for both categories for women¶
# All women's entries of the World Cup rankings, both disciplines, followed
# by a summary of the resulting frame.
df_cup_f = df_cup[df_cup['sex'].eq('F')]
df_cup_f.info()
<class 'pandas.core.frame.DataFrame'> Index: 2172 entries, 313 to 4357 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 2172 non-null int64 1 category 2172 non-null object 2 sex 2172 non-null object 3 full_name 2172 non-null object 4 height 928 non-null float64 5 country 2172 non-null object 6 rank 2172 non-null int64 dtypes: float64(1), int64(2), object(4) memory usage: 135.8+ KB
# Per-climber summary over both disciplines. 'category' is duplicated so that
# two different counts (lead entries, boulder entries) can be taken from it in
# one dict-based aggregation without producing multi-level columns.
# Copy just the one column instead of the whole frame.
column = df_cup_f['category'].copy()
# Guard the insert so that re-running this cell does not raise
# "cannot insert category_dup, already exists".
if 'category_dup' not in df_cup_f.columns:
    df_cup_f.insert(0, 'category_dup', column, allow_duplicates=False)
df_cup_f_agg = df_cup_f.groupby(['full_name'], as_index=False)
df_cup_f_agg = df_cup_f_agg.agg({'full_name': 'first', 'country': 'first', 'year': 'count',
                                 'category': lambda x: (x == 'lead').sum(),
                                 'category_dup': lambda x: (x == 'boulder').sum(),
                                 'height': 'mean', 'rank': 'mean'})
# Climbers with the most ranking entries first.
df_cup_f_agg = df_cup_f_agg.sort_values('year', ascending=False)
df_cup_f_agg = df_cup_f_agg.rename(columns={'year': 'times in ranking', 'rank' : 'avg. rank',
                                            'category': 'in lead', 'category_dup': 'in boulder'})
df_cup_f_agg.info()
<class 'pandas.core.frame.DataFrame'> Index: 687 entries, 116 to 513 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 full_name 687 non-null object 1 country 687 non-null object 2 times in ranking 687 non-null int64 3 in lead 687 non-null int64 4 in boulder 687 non-null int64 5 height 159 non-null float64 6 avg. rank 687 non-null float64 dtypes: float64(2), int64(3), object(2) memory usage: 42.9+ KB
df_cup_f_agg.head(20)
full_name | country | times in ranking | in lead | in boulder | height | avg. rank | |
---|---|---|---|---|---|---|---|
116 | chloe caulier | BEL | 17 | 7 | 10 | 166.0 | 49.764706 |
314 | katja debevec | SLO | 17 | 8 | 9 | 173.0 | 37.705882 |
280 | jessica pilz | AUT | 17 | 10 | 7 | 165.0 | 14.529412 |
441 | miho nonaka | JPN | 16 | 7 | 9 | 163.0 | 16.687500 |
8 | akiyo noguchi | JPN | 16 | 8 | 8 | 167.0 | 8.687500 |
189 | fanny gibert | FRA | 15 | 5 | 10 | 165.0 | 20.666667 |
291 | julia chanourdie | FRA | 15 | 10 | 5 | 164.0 | 34.733333 |
271 | janja garnbret | SLO | 15 | 8 | 7 | 164.0 | 4.533333 |
35 | andrea kümin | SUI | 15 | 7 | 8 | 164.0 | 57.400000 |
590 | sol sa | KOR | 15 | 6 | 9 | NaN | 32.333333 |
460 | molly thompson-smith | GBR | 14 | 10 | 4 | 159.0 | 42.714286 |
328 | kyra condie | USA | 14 | 5 | 9 | 162.0 | 41.642857 |
518 | petra klingler | SUI | 14 | 4 | 10 | 162.0 | 26.000000 |
59 | anne-sophie koller | SUI | 14 | 10 | 4 | 160.0 | 55.214286 |
247 | ievgeniia kazbekova | UKR | 14 | 8 | 6 | 164.0 | 30.928571 |
267 | jain kim | KOR | 13 | 8 | 5 | 152.0 | 16.000000 |
650 | vita lukan | SLO | 12 | 7 | 5 | 164.0 | 24.333333 |
598 | stasa gejo | SRB | 12 | 5 | 7 | 175.0 | 28.250000 |
95 | brooke raboutou | USA | 12 | 6 | 6 | 158.0 | 26.333333 |
432 | mia krampl | SLO | 12 | 7 | 5 | 163.0 | 25.083333 |
df_cup_f_agg.describe()
times in ranking | in lead | in boulder | height | avg. rank | |
---|---|---|---|---|---|
count | 687.000000 | 687.000000 | 687.000000 | 159.000000 | 687.000000 |
mean | 3.161572 | 1.500728 | 1.660844 | 163.270440 | 73.178412 |
std | 3.124031 | 2.002094 | 1.921179 | 5.880331 | 32.171613 |
min | 1.000000 | 0.000000 | 0.000000 | 149.000000 | 4.533333 |
25% | 1.000000 | 0.000000 | 0.000000 | 160.000000 | 51.298611 |
50% | 2.000000 | 1.000000 | 1.000000 | 163.000000 | 72.000000 |
75% | 4.000000 | 2.000000 | 2.000000 | 167.500000 | 94.000000 |
max | 17.000000 | 10.000000 | 10.000000 | 181.000000 | 158.000000 |
# Per-country summary of the women's aggregated ranking: number of climbers
# and their mean height, ordered by number of climbers (descending).
df_cup_f_agg_country = df_cup_f_agg.groupby(['country'], as_index=False)
df_cup_f_agg_country = df_cup_f_agg_country.agg({'full_name': 'count', 'height': 'mean'})
df_cup_f_agg_country = df_cup_f_agg_country.sort_values('full_name', ascending=False)
# rename() returns a new frame, so take head() from its result; calling it on
# a separate line threw the renamed frame away and showed the raw column names.
# The stored frame keeps its raw names, as in the other per-country cells.
df_cup_f_agg_country.rename(columns={'full_name': 'no. of climbers', 'height': 'avg. height'}).head(30)
country | full_name | height | |
---|---|---|---|
61 | USA | 67 | 164.857143 |
33 | JPN | 58 | 160.055556 |
18 | FRA | 48 | 162.222222 |
8 | CAN | 33 | 165.000000 |
21 | GER | 31 | 164.777778 |
35 | KOR | 31 | 160.625000 |
50 | RUS | 30 | 163.250000 |
2 | AUT | 30 | 165.900000 |
32 | ITA | 29 | 162.571429 |
19 | GBR | 27 | 165.000000 |
52 | SLO | 25 | 164.307692 |
10 | CHN | 18 | 161.333333 |
54 | SUI | 18 | 161.833333 |
27 | INA | 17 | 155.500000 |
12 | CZE | 16 | 162.000000 |
43 | NOR | 16 | 164.000000 |
1 | AUS | 14 | 171.333333 |
28 | IND | 13 | NaN |
47 | POL | 12 | 168.000000 |
29 | IRI | 11 | 162.000000 |
4 | BEL | 10 | 162.500000 |
60 | UKR | 9 | 159.000000 |
16 | ESP | 9 | NaN |
51 | SGP | 8 | 169.000000 |
31 | ISR | 8 | 166.500000 |
38 | MEX | 7 | NaN |
56 | SWE | 7 | NaN |
17 | FIN | 6 | NaN |
5 | BRA | 6 | 164.000000 |
41 | NED | 6 | 165.500000 |
Analysis of the lead world cup data for women¶
# All women's lead entries of the World Cup rankings (one row per climber and
# year), followed by a summary of the resulting frame.
df_f_lead = df_cup[df_cup['sex'].eq('F') & df_cup['category'].eq('lead')]
df_f_lead.info()
<class 'pandas.core.frame.DataFrame'> Index: 1031 entries, 313 to 4239 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 1031 non-null int64 1 category 1031 non-null object 2 sex 1031 non-null object 3 full_name 1031 non-null object 4 height 489 non-null float64 5 country 1031 non-null object 6 rank 1031 non-null int64 dtypes: float64(1), int64(2), object(4) memory usage: 64.4+ KB
df_f_lead.describe()
# note: the numbers below are computed over ranking entries, so a climber who
# was ranked in several years is counted once for each of those years
year | height | rank | |
---|---|---|---|
count | 1031.000000 | 489.000000 | 1031.000000 |
mean | 2018.260912 | 162.188139 | 54.704171 |
std | 3.422203 | 5.565683 | 34.858978 |
min | 2013.000000 | 149.000000 | 1.000000 |
25% | 2015.000000 | 159.000000 | 26.000000 |
50% | 2018.000000 | 162.000000 | 52.000000 |
75% | 2022.000000 | 165.000000 | 77.000000 |
max | 2023.000000 | 176.000000 | 150.000000 |
Analysis of aggregated lead rankings for all years¶
# Collapse the yearly lead entries to one row per climber: country, number of
# years ranked, mean height and mean rank. Climbers with the most ranked
# years come first.
df_f_lead_agg = (
    df_f_lead
    .groupby(['full_name'], as_index=False)
    .agg({
        'full_name': 'first',
        'country': 'first',
        'year': 'count',
        'height': 'mean',
        'rank': 'mean',
    })
    .sort_values('year', ascending=False)
    .rename(columns={'year': 'years in ranking', 'rank': 'avg. rank'})
)
df_f_lead_agg.info()
<class 'pandas.core.frame.DataFrame'> Index: 427 entries, 349 to 426 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 full_name 427 non-null object 1 country 427 non-null object 2 years in ranking 427 non-null int64 3 height 130 non-null float64 4 avg. rank 427 non-null float64 dtypes: float64(2), int64(1), object(2) memory usage: 20.0+ KB
df_f_lead_agg.head(20)
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
349 | salomé romain | FRA | 10 | 149.0 | 23.200000 |
171 | jessica pilz | AUT | 10 | 165.0 | 11.200000 |
179 | julia chanourdie | FRA | 10 | 164.0 | 23.600000 |
283 | molly thompson-smith | GBR | 10 | 159.0 | 30.300000 |
68 | claudia ghisolfi | ITA | 10 | 162.0 | 40.900000 |
32 | anne-sophie koller | SUI | 10 | 160.0 | 45.700000 |
181 | julia fiser | AUT | 9 | 158.0 | 51.222222 |
384 | tina johnsen hafsaas | NOR | 9 | 164.0 | 43.333333 |
143 | hélène janicot | FRA | 8 | 165.0 | 11.875000 |
270 | michelle hulliger | SUI | 8 | 157.0 | 50.500000 |
161 | jain kim | KOR | 8 | 152.0 | 3.250000 |
146 | ievgeniia kazbekova | UKR | 8 | 164.0 | 30.750000 |
339 | risa ota | JPN | 8 | 161.0 | 27.750000 |
192 | katja debevec | SLO | 8 | 173.0 | 46.625000 |
164 | janja garnbret | SLO | 8 | 164.0 | 2.000000 |
6 | akiyo noguchi | JPN | 8 | 167.0 | 14.375000 |
401 | vita lukan | SLO | 7 | 164.0 | 15.714286 |
234 | magdalena röck | AUT | 7 | NaN | 30.428571 |
17 | anak verhoeven | BEL | 7 | NaN | 8.428571 |
275 | mina markovic | SLO | 7 | 161.0 | 7.285714 |
df_f_lead_agg.describe()
years in ranking | height | avg. rank | |
---|---|---|---|
count | 427.000000 | 130.000000 | 427.000000 |
mean | 2.414520 | 162.869231 | 68.605569 |
std | 2.059918 | 5.542491 | 32.570639 |
min | 1.000000 | 149.000000 | 2.000000 |
25% | 1.000000 | 160.000000 | 45.850000 |
50% | 2.000000 | 163.000000 | 68.000000 |
75% | 3.000000 | 167.000000 | 87.500000 |
max | 10.000000 | 176.000000 | 150.000000 |
df_f_lead_agg.loc[df_f_lead_agg['years in ranking'] >= 5].describe()
years in ranking | height | avg. rank | |
---|---|---|---|
count | 64.000000 | 42.000000 | 64.000000 |
mean | 6.656250 | 162.000000 | 36.275961 |
std | 1.545282 | 5.512735 | 18.338856 |
min | 5.000000 | 149.000000 | 2.000000 |
25% | 5.000000 | 159.000000 | 23.500000 |
50% | 6.000000 | 162.500000 | 34.500000 |
75% | 7.250000 | 165.000000 | 50.000000 |
max | 10.000000 | 175.000000 | 75.666667 |
df_f_lead_agg.loc[df_f_lead_agg['height'] > 172]
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
192 | katja debevec | SLO | 8 | 173.0 | 46.625 |
371 | stasa gejo | SRB | 5 | 175.0 | 52.800 |
314 | oceania mackenzie | AUS | 4 | 173.0 | 58.250 |
180 | julia duffy | USA | 2 | 176.0 | 60.000 |
184 | julija kruder | SLO | 2 | 175.0 | 96.000 |
108 | flavy cohaut | FRA | 1 | 174.0 | 128.000 |
df_f_lead_agg.loc[df_f_lead_agg['height'] < 153]
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
349 | salomé romain | FRA | 10 | 149.0 | 23.200000 |
161 | jain kim | KOR | 8 | 152.0 | 3.250000 |
74 | dinara fakhritdinova | RUS | 7 | 152.0 | 17.571429 |
206 | laura rogora | ITA | 6 | 152.0 | 15.166667 |
336 | rebeka kamin | SLO | 4 | 152.0 | 42.250000 |
297 | natsuki tanii | JPN | 4 | 152.0 | 5.750000 |
280 | miu kakizaki | JPN | 3 | 149.0 | 49.000000 |
# Histogram of the per-climber heights in the aggregated women's lead ranking,
# with the mean (solid line) and mean +/- one standard deviation (dashed
# lines) marked.
mean = df_f_lead_agg['height'].mean()
std = df_f_lead_agg['height'].std()
# One bin per centimetre. The stop value is max+2 so the tallest height gets
# its own [max, max+1] bin: the last histogram bin is closed on both sides,
# so with max+1 the heights max-1 and max would be merged into a single bar.
bins = np.arange(np.floor(df_f_lead_agg['height'].min()), df_f_lead_agg['height'].max()+2, 1)
plt.hist(df_f_lead_agg['height'], bins=bins, color='tomato', zorder=10, alpha=0.9)
plt.axvline(mean, color='black', lw=0.9)
plt.axvline(mean-std, linestyle='--', color='black', lw=0.9)
plt.axvline(mean+std, linestyle='--', color='black', lw=0.9)
ax = plt.gca()
ax.set_xticks(bins[::3])
# '\\pm' spells the mathtext command without the invalid '\p' escape sequence
# (a raw string is not an option here because of the '\n' line break).
plt.text(0.05, 0.87, f'Avg. height\nis {round(mean, 1)} $\\pm$ {round(std, 1)}', transform=ax.transAxes)
plt.title('Height distribution of female climbers registered in\nthe IFSC '+
          'Lead World Cup ranking between 2013 and 2023')
plt.xlabel('height [cm]')
plt.ylabel('number of climbers')
plt.show()
# Keep only the climbers with a recorded height, then test that sample
# for normality.
df_f_lead_agg_clean = df_f_lead_agg.dropna(subset=['height'])
stats.normaltest(df_f_lead_agg_clean['height'])
NormaltestResult(statistic=0.2331952749381621, pvalue=0.8899432009328268)
# One-sample Kolmogorov-Smirnov test of the known lead heights against a
# normal CDF; `args` is forwarded to stats.norm.cdf as (loc, scale).
# NOTE(review): world_f_height_avg / world_f_height_std are defined earlier in
# the notebook, presumably the world-wide female mean and std - confirm.
stats.ks_1samp(df_f_lead_agg_clean['height'], stats.norm.cdf, args=(world_f_height_avg, world_f_height_std))
KstestResult(statistic=0.20615375784824586, pvalue=2.533600800064976e-05, statistic_location=165.0, statistic_sign=1)
# One-sample t-test: does the mean of the known lead heights differ from
# world_f_height_avg (defined earlier in the notebook)?
stats.ttest_1samp(df_f_lead_agg_clean['height'], world_f_height_avg)
TtestResult(statistic=-3.766173501894482, pvalue=0.0002508890927849896, df=129)
# Per-country climber count and mean height for the aggregated lead ranking,
# ordered by the number of climbers (largest first).
df_f_lead_agg_country = (
    df_f_lead_agg
    .groupby(['country'], as_index=False)
    .agg({'full_name': 'count', 'height': 'mean'})
    .sort_values('full_name', ascending=False)
)
# Displayed with friendlier headers; the stored frame keeps the raw names.
df_f_lead_agg_country.rename(columns={'full_name': 'no. of climbers', 'height': 'avg. height'})
country | no. of climbers | avg. height | |
---|---|---|---|
25 | JPN | 36 | 159.800000 |
14 | FRA | 31 | 162.357143 |
47 | USA | 31 | 164.300000 |
27 | KOR | 25 | 161.200000 |
39 | SLO | 24 | 164.307692 |
16 | GER | 23 | 164.000000 |
37 | RUS | 22 | 160.666667 |
15 | GBR | 19 | 165.000000 |
24 | ITA | 19 | 159.200000 |
8 | CHN | 17 | 161.333333 |
2 | AUT | 17 | 164.625000 |
19 | INA | 16 | 155.500000 |
41 | SUI | 15 | 161.833333 |
9 | CZE | 12 | 162.000000 |
6 | CAN | 11 | 163.000000 |
31 | NOR | 11 | 164.000000 |
3 | BEL | 8 | 162.500000 |
34 | POL | 7 | 168.000000 |
1 | AUS | 7 | 166.500000 |
23 | ISR | 6 | 166.500000 |
7 | CHI | 5 | 160.666667 |
12 | ESP | 5 | NaN |
4 | BRA | 5 | 164.000000 |
30 | NED | 4 | 165.500000 |
43 | SWE | 4 | NaN |
17 | HKG | 4 | NaN |
44 | THA | 4 | NaN |
46 | UKR | 3 | 159.000000 |
32 | NZL | 3 | NaN |
0 | ARG | 3 | 157.000000 |
20 | IND | 3 | NaN |
28 | MEX | 3 | NaN |
42 | SVK | 2 | 169.000000 |
45 | TPE | 2 | 167.000000 |
11 | ECU | 2 | NaN |
36 | RSA | 2 | NaN |
21 | IRI | 2 | 161.000000 |
38 | SGP | 2 | NaN |
40 | SRB | 2 | 175.000000 |
5 | BUL | 1 | 162.000000 |
10 | DEN | 1 | NaN |
13 | FIN | 1 | NaN |
29 | MKD | 1 | NaN |
18 | HUN | 1 | NaN |
35 | PUR | 1 | NaN |
22 | ISL | 1 | 171.000000 |
33 | PER | 1 | 168.000000 |
26 | KAZ | 1 | NaN |
48 | VEN | 1 | NaN |
Changes in the statistics over the years¶
# Seasons 2013-2023 with 2020 left out.
years = [season for season in range(2013, 2024) if season != 2020]
lf_height_list = []
lf_height_std_list = []
for year in years:
    # Heights of every lead climber ranked in this season (NaN when unknown).
    heights = df_f_lead.loc[df_f_lead['year'] == year, 'height']
    year_mean = heights.mean()
    year_std = heights.std()
    lf_height_list.append(year_mean)
    lf_height_std_list.append(year_std)
    total = len(heights)
    missing = int(heights.isnull().sum())
    known = total - missing
    missing_share = round(100.0*missing/total, 1)
    print(f'Avg. height in {year}: {round(year_mean, 1)} +- {round(year_std, 1)} '
          f'({known} records, {missing} NaN, {missing_share}% of NaNs in {total} climbers)')
Avg. height in 2013: 161.3 +- 5.9 (19 records, 77 NaN, 80.2% of NaNs in 96 climbers) Avg. height in 2014: 160.9 +- 6.7 (21 records, 78 NaN, 78.8% of NaNs in 99 climbers) Avg. height in 2015: 161.0 +- 6.1 (22 records, 62 NaN, 73.8% of NaNs in 84 climbers) Avg. height in 2016: 161.2 +- 5.6 (33 records, 49 NaN, 59.8% of NaNs in 82 climbers) Avg. height in 2017: 162.2 +- 5.6 (44 records, 64 NaN, 59.3% of NaNs in 108 climbers) Avg. height in 2018: 162.7 +- 5.8 (49 records, 51 NaN, 51.0% of NaNs in 100 climbers) Avg. height in 2019: 161.8 +- 5.0 (55 records, 30 NaN, 35.3% of NaNs in 85 climbers) Avg. height in 2021: 162.3 +- 5.4 (67 records, 11 NaN, 14.1% of NaNs in 78 climbers) Avg. height in 2022: 162.7 +- 5.6 (86 records, 59 NaN, 40.7% of NaNs in 145 climbers) Avg. height in 2023: 162.7 +- 5.3 (93 records, 61 NaN, 39.6% of NaNs in 154 climbers)
# For each season compare the average rank of climbers with and without a
# recorded height.
for year in years:
    df_y = df_f_lead.loc[df_f_lead['year'] == year]
    has_height = df_y['height'].notnull()
    avg_rank = round(df_y['rank'].mean())
    avg_rank_nan = round(df_y.loc[~has_height, 'rank'].mean())
    avg_rank_not_nan = round(df_y.loc[has_height, 'rank'].mean())
    print(f'{year} avg. rank: {avg_rank}; avg. rank of climbers w/out height {avg_rank_nan},'
          f' and with height data {avg_rank_not_nan}')
2013 avg. rank: 48; avg. rank of climbers w/out height 51, and with height data 35 2014 avg. rank: 50; avg. rank of climbers w/out height 53, and with height data 39 2015 avg. rank: 42; avg. rank of climbers w/out height 47, and with height data 28 2016 avg. rank: 41; avg. rank of climbers w/out height 43, and with height data 39 2017 avg. rank: 54; avg. rank of climbers w/out height 66, and with height data 38 2018 avg. rank: 50; avg. rank of climbers w/out height 58, and with height data 41 2019 avg. rank: 43; avg. rank of climbers w/out height 55, and with height data 36 2021 avg. rank: 39; avg. rank of climbers w/out height 63, and with height data 35 2022 avg. rank: 73; avg. rank of climbers w/out height 105, and with height data 50 2023 avg. rank: 77; avg. rank of climbers w/out height 107, and with height data 57
# Box plot of the lead heights, one box per season, with the mean marked by
# a grey cross and outliers by hollow orange-red circles.
outlier_marker = {'markerfacecolor': 'none', 'marker': 'o',
                  'markeredgecolor': 'orangered', 'alpha': 0.7}
meanprops = {'markerfacecolor': 'none', 'marker': 'x',
             'markeredgecolor': 'gray', 'alpha': 0.9, 'markersize': 4}
boxprops = {'color': 'darkslateblue'}
whiskerprops = boxprops  # whiskers share the box styling
df_f_lead.boxplot(
    column='height',
    by='year',
    flierprops=outlier_marker,
    showmeans=True,
    meanprops=meanprops,
    boxprops=boxprops,
    whiskerprops=whiskerprops,
)
ax = plt.gca()
ax.grid(linestyle='--', alpha=0.5)
ax.grid(axis='x')
ax.set_yticks(np.arange(149, 178, 3))
# Clear the figure-level title so that only the axes title below remains.
plt.suptitle('')
plt.title('Box plot of height distribution of female climbers registered\n'
          'in the IFSC Lead World Cup ranking for different years')
plt.xlabel('year')
plt.ylabel('height [cm]')
plt.show()
Analysis of the bouldering world cup data for women¶
# Rows of the world cup data for women in the bouldering category.
is_female = df_cup['sex'] == 'F'
is_boulder = df_cup['category'] == 'boulder'
df_f_boulder = df_cup.loc[is_female & is_boulder]
df_f_boulder.info()
<class 'pandas.core.frame.DataFrame'> Index: 1141 entries, 458 to 4357 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 1141 non-null int64 1 category 1141 non-null object 2 sex 1141 non-null object 3 full_name 1141 non-null object 4 height 439 non-null float64 5 country 1141 non-null object 6 rank 1141 non-null int64 dtypes: float64(1), int64(2), object(4) memory usage: 71.3+ KB
# Summary statistics over the per-season rows of the women's bouldering data.
df_f_boulder.describe()
# Note: a climber who was ranked in several years has one row per year, so
# she is counted that many times in the numbers below.
year | height | rank | |
---|---|---|---|
count | 1141.000000 | 439.000000 | 1141.000000 |
mean | 2017.991236 | 164.576310 | 60.514461 |
std | 3.453439 | 5.567599 | 37.891269 |
min | 2013.000000 | 150.000000 | 1.000000 |
25% | 2015.000000 | 161.000000 | 29.000000 |
50% | 2018.000000 | 164.000000 | 58.000000 |
75% | 2022.000000 | 168.000000 | 88.000000 |
max | 2023.000000 | 181.000000 | 158.000000 |
Analysis of aggregated bouldering rankings for all years¶
# Collapse the per-season rows into one row per climber: number of ranked
# seasons, mean height and mean rank, ordered by seasons in the ranking.
df_f_boulder_agg = (
    df_f_boulder
    .groupby(['full_name'], as_index=False)
    .agg({'full_name': 'first', 'country': 'first', 'year': 'count',
          'height': 'mean', 'rank': 'mean'})
    .sort_values('year', ascending=False)
    .rename(columns={'year': 'years in ranking', 'rank' : 'avg. rank'})
)
df_f_boulder_agg.info()
<class 'pandas.core.frame.DataFrame'> Index: 487 entries, 148 to 243 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 full_name 487 non-null object 1 country 487 non-null object 2 years in ranking 487 non-null int64 3 height 133 non-null float64 4 avg. rank 487 non-null float64 dtypes: float64(2), int64(1), object(2) memory usage: 22.8+ KB
# First 20 rows; the frame is sorted by 'years in ranking' in descending
# order, so these are the climbers with the most ranked seasons.
df_f_boulder_agg.head(20)
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
148 | fanny gibert | FRA | 10 | 165.0 | 13.500000 |
366 | petra klingler | SUI | 10 | 162.0 | 13.800000 |
97 | chloe caulier | BEL | 10 | 166.0 | 33.400000 |
217 | julija kruder | SLO | 10 | 175.0 | 47.500000 |
420 | sol sa | KOR | 9 | NaN | 20.666667 |
319 | miho nonaka | JPN | 9 | 163.0 | 4.333333 |
237 | kyra condie | USA | 9 | 162.0 | 42.111111 |
228 | katja debevec | SLO | 9 | 173.0 | 29.777778 |
344 | natalie bärtschi | SUI | 9 | NaN | 72.000000 |
29 | andrea kümin | SUI | 8 | 164.0 | 56.250000 |
21 | alma bestvater | GER | 8 | 161.0 | 53.000000 |
408 | shauna coxsey | GBR | 8 | 164.0 | 11.375000 |
8 | akiyo noguchi | JPN | 8 | 167.0 | 3.000000 |
210 | johanna färber | AUT | 7 | 171.0 | 36.000000 |
158 | franziska sterrer | AUT | 7 | 169.0 | 30.857143 |
120 | ekaterina kipriianova | RUS | 7 | NaN | 39.000000 |
207 | jessica pilz | AUT | 7 | 165.0 | 19.285714 |
419 | sofya yokoyama | SUI | 7 | NaN | 59.714286 |
384 | rong jiang | CHN | 7 | NaN | 58.714286 |
184 | hung ying lee | TPE | 7 | 170.0 | 53.142857 |
# Summary statistics with every climber counted once.
df_f_boulder_agg.describe()
years in ranking | height | avg. rank | |
---|---|---|---|
count | 487.000000 | 133.000000 | 487.000000 |
mean | 2.342916 | 163.947368 | 75.153275 |
std | 1.899606 | 5.560025 | 35.488408 |
min | 1.000000 | 150.000000 | 3.000000 |
25% | 1.000000 | 160.000000 | 51.000000 |
50% | 2.000000 | 164.000000 | 73.000000 |
75% | 3.000000 | 168.000000 | 100.000000 |
max | 10.000000 | 181.000000 | 158.000000 |
# Same statistics restricted to climbers ranked in at least five seasons.
df_f_boulder_agg[df_f_boulder_agg['years in ranking'] >= 5].describe()
years in ranking | height | avg. rank | |
---|---|---|---|
count | 66.000000 | 37.000000 | 66.000000 |
mean | 6.378788 | 165.324324 | 39.960528 |
std | 1.526838 | 5.701141 | 23.224164 |
min | 5.000000 | 152.000000 | 3.000000 |
25% | 5.000000 | 162.000000 | 23.850000 |
50% | 6.000000 | 164.000000 | 36.300000 |
75% | 7.000000 | 168.000000 | 55.500000 |
max | 10.000000 | 181.000000 | 97.000000 |
# Boulderers taller than 172 cm (rows without a height never match).
df_f_boulder_agg[df_f_boulder_agg['height'] > 172]
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
217 | julija kruder | SLO | 10 | 175.0 | 47.500000 |
228 | katja debevec | SLO | 9 | 173.0 | 29.777778 |
423 | stasa gejo | SRB | 7 | 175.0 | 10.714286 |
154 | flavy cohaut | FRA | 5 | 174.0 | 36.200000 |
137 | emma horan | AUS | 5 | 181.0 | 80.200000 |
357 | oceania mackenzie | AUS | 4 | 173.0 | 32.000000 |
411 | sienna kopf | USA | 3 | 174.0 | 47.666667 |
# Boulderers shorter than 154 cm (rows without a height never match).
df_f_boulder_agg[df_f_boulder_agg['height'] < 154]
full_name | country | years in ranking | height | avg. rank | |
---|---|---|---|---|---|
199 | jain kim | KOR | 5 | 152.0 | 36.4 |
241 | laura rogora | ITA | 4 | 152.0 | 31.0 |
116 | dinara fakhritdinova | RUS | 2 | 152.0 | 22.5 |
346 | natsuki tanii | JPN | 1 | 152.0 | 51.0 |
90 | chaeyeong kim | KOR | 1 | 150.0 | 128.0 |
# Histogram of the heights in the aggregated bouldering ranking, with the
# mean drawn as a solid line and mean +/- one standard deviation as dashed
# lines.
mean = df_f_boulder_agg['height'].mean()
std = df_f_boulder_agg['height'].std()
# One-centimetre bins. np.arange leaves out its stop value and the last
# histogram bin is closed on both ends, so the stop must be max+2; with max+1
# the tallest climbers would be counted together with those 1 cm shorter.
bins = np.arange(np.floor(df_f_boulder_agg['height'].min()), df_f_boulder_agg['height'].max()+2, 1)
plt.hist(df_f_boulder_agg['height'], bins=bins, color='tomato', zorder=10, alpha=0.9)
plt.axvline(mean, color='black', lw=0.9)
plt.axvline(mean-std, linestyle='--', color='black', lw=0.9)
plt.axvline(mean+std, linestyle='--', color='black', lw=0.9)
ax = plt.gca()
ax.set_xticks(bins[::3])
# The backslash of the mathtext \pm is doubled: a bare '\p' is an invalid
# escape sequence in a non-raw string literal.
plt.text(0.05, 0.87, f'Avg. height\nis {round(mean, 1)} $\\pm$ {round(std, 1)}', transform=ax.transAxes)
plt.title('Height distribution of female climbers registered in the IFSC\n'+
          'Bouldering World Cup ranking between 2013 and 2023')
plt.xlabel('height [cm]')
plt.ylabel('number of climbers')
plt.show()
# Drop the climbers without a recorded height and the 181 cm record, then
# test the remaining sample for normality.
boulder_known_heights = df_f_boulder_agg.dropna(subset=['height'])
df_f_boulder_agg_clean = boulder_known_heights.loc[boulder_known_heights['height'] < 181]
stats.normaltest(df_f_boulder_agg_clean['height'])
NormaltestResult(statistic=1.2085361042148373, pvalue=0.5464742709802409)
# One-sample Kolmogorov-Smirnov test of the cleaned bouldering heights
# against a normal CDF; `args` is forwarded to stats.norm.cdf as (loc, scale).
# NOTE(review): world_f_height_avg / world_f_height_std are defined earlier in
# the notebook, presumably the world-wide female mean and std - confirm.
stats.ks_1samp(df_f_boulder_agg_clean['height'], stats.norm.cdf, args=(world_f_height_avg, world_f_height_std))
KstestResult(statistic=0.14216774386223185, pvalue=0.00862427465183941, statistic_location=165.0, statistic_sign=1)
# One-sample t-test: does the mean of the cleaned bouldering heights differ
# from world_f_height_avg (defined earlier in the notebook)?
stats.ttest_1samp(df_f_boulder_agg_clean['height'], world_f_height_avg)
TtestResult(statistic=-1.8841588901953716, pvalue=0.06175961716324531, df=131)
# Per-country climber count and mean height for the aggregated bouldering
# ranking, ordered by the number of climbers (largest first).
df_f_boulder_agg_country = df_f_boulder_agg.groupby(['country'], as_index=False)
df_f_boulder_agg_country = df_f_boulder_agg_country.agg({'full_name': 'count', 'height': 'mean'})
df_f_boulder_agg_country = df_f_boulder_agg_country.sort_values('full_name', ascending=False)
# rename() returns a new frame, so its result has to be assigned; otherwise
# the table shown below keeps the raw 'full_name' / 'height' headers.
df_f_boulder_agg_country = df_f_boulder_agg_country.rename(
    columns={'full_name': 'no. of climbers', 'height': 'avg. height'})
df_f_boulder_agg_country.head(30)
country | full_name | height | |
---|---|---|---|
61 | USA | 58 | 164.000000 |
33 | JPN | 44 | 161.000000 |
18 | FRA | 32 | 163.538462 |
8 | CAN | 29 | 164.333333 |
2 | AUT | 25 | 166.777778 |
21 | GER | 20 | 165.142857 |
32 | ITA | 19 | 163.000000 |
19 | GBR | 19 | 165.000000 |
50 | RUS | 17 | 163.250000 |
35 | KOR | 15 | 160.625000 |
52 | SLO | 13 | 166.600000 |
28 | IND | 11 | NaN |
29 | IRI | 11 | 162.000000 |
1 | AUS | 11 | 177.000000 |
10 | CHN | 11 | 160.000000 |
54 | SUI | 10 | 162.800000 |
43 | NOR | 9 | 164.000000 |
12 | CZE | 8 | 162.000000 |
38 | MEX | 7 | NaN |
47 | POL | 7 | NaN |
31 | ISR | 7 | 166.500000 |
51 | SGP | 7 | 169.000000 |
60 | UKR | 7 | 160.666667 |
17 | FIN | 6 | NaN |
16 | ESP | 6 | NaN |
41 | NED | 5 | 165.500000 |
57 | THA | 5 | NaN |
5 | BRA | 5 | 164.000000 |
55 | SVK | 4 | 169.000000 |
56 | SWE | 4 | NaN |
Changes in the statistics over the years¶
# Seasons 2013-2023 with 2020 left out.
years = [season for season in range(2013, 2024) if season != 2020]
bf_height_list = []
bf_height_std_list = []
for year in years:
    # Emma Horan raises the average by 1cm in 2014 and 2015, and by 0.5cm
    # in 2016-2018.
    heights = df_f_boulder.loc[df_f_boulder['year'] == year, 'height']
    year_mean = heights.mean()
    year_std = heights.std()
    bf_height_list.append(year_mean)
    bf_height_std_list.append(year_std)
    total = len(heights)
    missing = int(heights.isnull().sum())
    known = total - missing
    missing_share = round(100.0*missing/total, 1)
    print(f'Avg. height in {year}: {round(year_mean, 1)} +- {round(year_std, 1)} '
          f'({known} records, {missing} NaN, {missing_share}% of NaNs in {total} climbers)')
Avg. height in 2013: 162.6 +- 5.3 (17 records, 101 NaN, 85.6% of NaNs in 118 climbers) Avg. height in 2014: 164.3 +- 7.3 (18 records, 112 NaN, 86.2% of NaNs in 130 climbers) Avg. height in 2015: 167.6 +- 5.8 (16 records, 69 NaN, 81.2% of NaNs in 85 climbers) Avg. height in 2016: 165.6 +- 5.8 (30 records, 92 NaN, 75.4% of NaNs in 122 climbers) Avg. height in 2017: 165.6 +- 6.0 (36 records, 77 NaN, 68.1% of NaNs in 113 climbers) Avg. height in 2018: 165.6 +- 5.7 (50 records, 53 NaN, 51.5% of NaNs in 103 climbers) Avg. height in 2019: 163.7 +- 5.9 (55 records, 33 NaN, 37.5% of NaNs in 88 climbers) Avg. height in 2021: 164.6 +- 5.1 (53 records, 15 NaN, 22.1% of NaNs in 68 climbers) Avg. height in 2022: 164.4 +- 4.9 (74 records, 76 NaN, 50.7% of NaNs in 150 climbers) Avg. height in 2023: 163.8 +- 5.2 (90 records, 74 NaN, 45.1% of NaNs in 164 climbers)
# For each season compare the average rank of boulderers with and without a
# recorded height.
for year in years:
    df_y = df_f_boulder.loc[df_f_boulder['year'] == year]
    has_height = df_y['height'].notnull()
    avg_rank = round(df_y['rank'].mean())
    avg_rank_nan = round(df_y.loc[~has_height, 'rank'].mean())
    avg_rank_not_nan = round(df_y.loc[has_height, 'rank'].mean())
    print(f'{year} avg. rank: {avg_rank}; avg. rank of climbers w/out height {avg_rank_nan},'
          f' and with height data {avg_rank_not_nan}')
2013 avg. rank: 59; avg. rank of climbers w/out height 59, and with height data 57 2014 avg. rank: 65; avg. rank of climbers w/out height 68, and with height data 46 2015 avg. rank: 43; avg. rank of climbers w/out height 45, and with height data 35 2016 avg. rank: 61; avg. rank of climbers w/out height 65, and with height data 49 2017 avg. rank: 56; avg. rank of climbers w/out height 65, and with height data 37 2018 avg. rank: 52; avg. rank of climbers w/out height 61, and with height data 42 2019 avg. rank: 44; avg. rank of climbers w/out height 59, and with height data 35 2021 avg. rank: 34; avg. rank of climbers w/out height 56, and with height data 28 2022 avg. rank: 75; avg. rank of climbers w/out height 99, and with height data 50 2023 avg. rank: 82; avg. rank of climbers w/out height 107, and with height data 61
# Box plot of the bouldering heights, one box per season, with the mean
# marked by a grey cross and outliers by hollow orange-red circles.
outlier_marker = {'markerfacecolor': 'none', 'marker': 'o',
                  'markeredgecolor': 'orangered', 'alpha': 0.7}
meanprops = {'markerfacecolor': 'none', 'marker': 'x',
             'markeredgecolor': 'gray', 'alpha': 0.9, 'markersize': 4}
boxprops = {'color': 'darkslateblue'}
whiskerprops = boxprops  # whiskers share the box styling
df_f_boulder.boxplot(
    column='height',
    by='year',
    flierprops=outlier_marker,
    showmeans=True,
    meanprops=meanprops,
    boxprops=boxprops,
    whiskerprops=whiskerprops,
)
ax = plt.gca()
ax.grid(linestyle='--', alpha=0.5)
ax.grid(axis='x')
ax.set_yticks(np.arange(151, 182, 3))
# Clear the figure-level title so that only the axes title below remains.
plt.suptitle('')
plt.title('Box plot of height distribution of female climbers registered\n'
          'in the IFSC Bouldering World Cup rankings for different years')
plt.xlabel('year')
plt.ylabel('height [cm]')
plt.show()
Differences between lead and boulder for women¶
# Overlaid height histograms of the aggregated lead and bouldering rankings,
# each with its mean drawn as a vertical line.
mean = df_f_lead_agg['height'].mean()
std = df_f_lead_agg['height'].std()
# One-centimetre bins spanning BOTH series, so no value of either discipline
# can fall outside the plotted range. The stop is max+2 because np.arange
# leaves out its stop value and the last histogram bin is closed on both ends.
lowest = min(df_f_lead_agg['height'].min(), df_f_boulder_agg['height'].min())
highest = max(df_f_lead_agg['height'].max(), df_f_boulder_agg['height'].max())
bins = np.arange(np.floor(lowest), highest+2, 1)
plt.hist(df_f_lead_agg['height'], bins=bins, color='darkcyan', zorder=10, alpha=0.55, label='lead')
plt.axvline(mean, color='black', lw=0.95, linestyle='-.')
mean_b = df_f_boulder_agg['height'].mean()
std_b = df_f_boulder_agg['height'].std()
plt.hist(df_f_boulder_agg['height'], bins=bins, color='darkorange', zorder=10, alpha=0.55, label='boulder')
plt.axvline(mean_b, color='black', lw=0.95, linestyle='--')
plt.legend(loc=2)
ax = plt.gca()
ax.set_xticks(bins[::3])
# The backslash of the mathtext \pm is doubled: a bare '\p' is an invalid
# escape sequence in a non-raw string literal.
plt.text(0.64, 0.82, f'Avg. height is:\n{round(mean, 1)} $\\pm$ {round(std, 1)} in lead\n'+
         f'{round(mean_b, 1)} $\\pm$ {round(std_b, 1)} in boulder', transform=ax.transAxes)
plt.title('Height distribution of female climbers registered in\nthe IFSC World Cup rankings between 2013 and 2023')
plt.xlabel('height [cm]')
plt.ylabel('number of climbers')
plt.show()
# Yearly mean height per discipline as a line, with a shaded band of
# +/- one standard deviation around it.
bf_height_list = np.array(bf_height_list)
bf_height_std_list = np.array(bf_height_std_list)
lf_height_list = np.array(lf_height_list)
lf_height_std_list = np.array(lf_height_std_list)
for means, stds, colour, label in (
    (bf_height_list, bf_height_std_list, 'darkorange', 'boulder'),
    (lf_height_list, lf_height_std_list, 'darkcyan', 'lead'),
):
    plt.plot(years, means, color=colour, lw=1.8, label=label)
    plt.fill_between(years, means - stds, means + stds,
                     color=colour, lw=0, alpha=0.2)
plt.xlim([min(years), max(years)])
plt.ylim([150.5, 176])
plt.legend()
ax = plt.gca()
ax.set_xticks(years)
ax.set_yticks(np.arange(151, 176, 2))
plt.title('Comparison of the avg. height of female climbers\nin lead and bouldering IFSC rankings over the years')
plt.xlabel('year')
plt.ylabel('height [cm]')
plt.show()
Drop in height from 2018 to 2019 for lead¶
# Lead climbers with a known height who were ranked in 2018 and/or 2019,
# one row per climber.
in_18_19 = df_f_lead['year'].isin([2018, 2019])
height_known = df_f_lead['height'].notnull()
df_f_lead_18_19 = (
    df_f_lead.loc[in_18_19 & height_known]
    .groupby(['full_name'], as_index=False)
    .agg({'full_name': 'first', 'country': 'first',
          'year': 'sum', 'height': 'mean'})
)
# 'year' now holds the SUM of a climber's seasons, so a value of exactly 2018
# (or 2019) identifies a climber who appears in that season only.
df_f_lead_18 = df_f_lead_18_19.loc[df_f_lead_18_19['year'] == 2018]
df_f_lead_19 = df_f_lead_18_19.loc[df_f_lead_18_19['year'] == 2019]
only_18_mean = df_f_lead_18["height"].mean()
print(f'Avg. height of climbers ranked in 2018 but not in 2019: {only_18_mean}')
df_f_lead_18.sort_values('height')
Avg. height of climbers ranked in 2018 but not in 2019: 165.58333333333334
full_name | country | year | height | |
---|---|---|---|---|
14 | dinara fakhritdinova | RUS | 2018 | 152.0 |
42 | manon hily | FRA | 2018 | 154.0 |
22 | gayeon cho | KOR | 2018 | 162.0 |
25 | hsiu-ju lin | TPE | 2018 | 164.0 |
23 | giorgia tesio | ITA | 2018 | 165.0 |
27 | hélène janicot | FRA | 2018 | 165.0 |
54 | patrycja chudziak | POL | 2018 | 168.0 |
62 | tjasa slemensek | SLO | 2018 | 168.0 |
7 | anouck jaubert | FRA | 2018 | 169.0 |
38 | laura stöckler | AUT | 2018 | 172.0 |
34 | katja debevec | SLO | 2018 | 173.0 |
60 | stasa gejo | SRB | 2018 | 175.0 |
# Same summary for the climbers who appear in 2019 but not in 2018.
only_19_mean = df_f_lead_19["height"].mean()
print(f'Avg. height of climbers ranked in 2019 but not in 2018: {only_19_mean}')
df_f_lead_19.sort_values('height')
Avg. height of climbers ranked in 2019 but not in 2018: 161.77777777777777
full_name | country | year | height | |
---|---|---|---|---|
49 | natsuki tanii | JPN | 2019 | 152.0 |
1 | ai mori | JPN | 2019 | 154.0 |
50 | nika potapova | UKR | 2019 | 154.0 |
65 | yuetong zhang | CHN | 2019 | 160.0 |
39 | lucinda ann turnbull | AUS | 2019 | 160.0 |
4 | alma bestvater | GER | 2019 | 161.0 |
17 | elnaz rekabi | IRI | 2019 | 161.0 |
63 | viktoriia meshkova | RUS | 2019 | 161.0 |
56 | risa ota | JPN | 2019 | 161.0 |
35 | kyra condie | USA | 2019 | 162.0 |
11 | chaehyun seo | KOR | 2019 | 163.0 |
3 | alannah yip | CAN | 2019 | 164.0 |
5 | andrea kümin | SUI | 2019 | 164.0 |
59 | shauna coxsey | GBR | 2019 | 164.0 |
0 | afra hönig | GER | 2019 | 165.0 |
19 | eva maria hammelmüller | AUT | 2019 | 167.0 |
15 | elena krasovskaia | RUS | 2019 | 169.0 |
26 | hung ying lee | TPE | 2019 | 170.0 |
Drop in height from 2018 to 2019 for bouldering¶
# Boulderers with a known height who were ranked in 2018 and/or 2019,
# one row per climber.
in_18_19 = df_f_boulder['year'].isin([2018, 2019])
height_known = df_f_boulder['height'].notnull()
df_f_boulder_18_19 = (
    df_f_boulder.loc[in_18_19 & height_known]
    .groupby(['full_name'], as_index=False)
    .agg({'full_name': 'first', 'country': 'first',
          'year': 'sum', 'height': 'mean'})
)
# 'year' now holds the SUM of a climber's seasons, so a value of exactly 2018
# (or 2019) identifies a climber who appears in that season only.
df_f_boulder_18 = df_f_boulder_18_19.loc[df_f_boulder_18_19['year'] == 2018]
df_f_boulder_19 = df_f_boulder_18_19.loc[df_f_boulder_18_19['year'] == 2019]
only_18_mean = df_f_boulder_18["height"].mean()
print(f'Avg. height of climbers ranked in 2018 but not in 2019: {only_18_mean}')
df_f_boulder_18.sort_values('height')
Avg. height of climbers ranked in 2018 but not in 2019: 169.1818181818182
full_name | country | year | height | |
---|---|---|---|---|
47 | mina markovic | SLO | 2018 | 161.0 |
22 | giorgia tesio | ITA | 2018 | 165.0 |
43 | megan lynch | USA | 2018 | 167.0 |
42 | maya madere | USA | 2018 | 168.0 |
56 | saki kikuchi | JPN | 2018 | 168.0 |
61 | tjasa slemensek | SLO | 2018 | 168.0 |
20 | franziska sterrer | AUT | 2018 | 169.0 |
62 | vanessa si yinn teng | SGP | 2018 | 169.0 |
27 | isabel gifford | USA | 2018 | 170.0 |
60 | stasa gejo | SRB | 2018 | 175.0 |
17 | emma horan | AUS | 2018 | 181.0 |
# Same summary for the boulderers who appear in 2019 but not in 2018.
only_19_mean = df_f_boulder_19["height"].mean()
print(f'Avg. height of climbers ranked in 2019 but not in 2018: {only_19_mean}')
df_f_boulder_19.sort_values('height')
Avg. height of climbers ranked in 2019 but not in 2018: 161.625
full_name | country | year | height | |
---|---|---|---|---|
36 | laura rogora | ITA | 2019 | 152.0 |
51 | natsuki tanii | JPN | 2019 | 152.0 |
1 | ai mori | JPN | 2019 | 154.0 |
12 | camilla moroni | ITA | 2019 | 157.0 |
48 | molly thompson-smith | GBR | 2019 | 159.0 |
4 | alejandra contreras | CHI | 2019 | 160.0 |
41 | mattea pötzi | AUT | 2019 | 160.0 |
65 | yuetong zhang | CHN | 2019 | 160.0 |
63 | viktoriia meshkova | RUS | 2019 | 161.0 |
55 | roxana wienand | GER | 2019 | 162.0 |
52 | naïlé meignan | FRA | 2019 | 164.0 |
64 | vita lukan | SLO | 2019 | 164.0 |
38 | lucia dörffel | GER | 2019 | 165.0 |
39 | lucka rakovec | SLO | 2019 | 170.0 |
37 | laura stöckler | AUT | 2019 | 172.0 |
59 | sienna kopf | USA | 2019 | 174.0 |
Jump in height from 2013 to 2015 in bouldering¶
# Boulderers with a known height ranked in 2013-2015, one row per climber,
# with the list of seasons in which each of them appears.
early_seasons = df_f_boulder['year'].isin([2013, 2014, 2015])
height_known = df_f_boulder['height'].notnull()
df_f_boulder_13_15 = (
    df_f_boulder.loc[early_seasons & height_known]
    .groupby(['full_name'], as_index=False)
    .agg({'full_name': 'first', 'country': 'first',
          'year': lambda seasons: list(seasons), 'height': 'mean'})
)
df_f_boulder_13_15.sort_values('height')
full_name | country | year | height | |
---|---|---|---|---|
6 | dinara fakhritdinova | RUS | [2014, 2013] | 152.0 |
14 | jain kim | KOR | [2013] | 152.0 |
19 | manon hily | FRA | [2014] | 154.0 |
20 | mei kotake | JPN | [2014] | 155.0 |
4 | anne-sophie koller | SUI | [2013] | 160.0 |
2 | alma bestvater | GER | [2015, 2014, 2013] | 161.0 |
24 | risa ota | JPN | [2013] | 161.0 |
7 | elnaz rekabi | IRI | [2014] | 161.0 |
22 | mina markovic | SLO | [2014, 2013] | 161.0 |
23 | petra klingler | SUI | [2015, 2014, 2013] | 162.0 |
18 | kyra condie | USA | [2015, 2014, 2013] | 162.0 |
21 | miho nonaka | JPN | [2015, 2014] | 163.0 |
13 | ievgeniia kazbekova | UKR | [2013] | 164.0 |
25 | shauna coxsey | GBR | [2015, 2014, 2013] | 164.0 |
15 | julia chanourdie | FRA | [2015, 2013] | 164.0 |
27 | tina johnsen hafsaas | NOR | [2015] | 164.0 |
3 | andrea kümin | SUI | [2013] | 164.0 |
12 | hélène janicot | FRA | [2014, 2013] | 165.0 |
9 | fanny gibert | FRA | [2015, 2014, 2013] | 165.0 |
5 | chloe caulier | BEL | [2015, 2014, 2013] | 166.0 |
0 | akiyo noguchi | JPN | [2015, 2014, 2013] | 167.0 |
10 | franziska sterrer | AUT | [2015] | 169.0 |
11 | hung ying lee | TPE | [2014] | 170.0 |
1 | allison vest | CAN | [2015] | 171.0 |
17 | katja debevec | SLO | [2015, 2014] | 173.0 |
16 | julija kruder | SLO | [2015, 2014, 2013] | 175.0 |
26 | stasa gejo | SRB | [2015] | 175.0 |
8 | emma horan | AUS | [2015, 2014] | 181.0 |
Comments
Post a Comment