import numpy as np
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# --- Parameters ---
# Spearman's early work involved small numbers of tests/variables and modest samples.
n_subjects = 200 # Number of simulated individuals
n_tests = 6 # Number of observable tests (e.g., different school subjects or simple tasks)
n_abilities = 150 # Total number of underlying independent abilities (much larger than n_tests, so tests can sample overlapping subsets)
# Each test 'samples' a random number of these independent abilities
abilities_per_test_range = (20, 50) # Min/max number of abilities contributing to a single test score
# Add some noise to simulate imperfect measurement
noise_std_dev = 0.6
# --- Simulation Steps ---
# 1. Generate Underlying Independent Abilities
# Each ability score for each subject is drawn independently from a standard normal distribution.
np.random.seed(42) # Set seed for reproducible results
abilities = np.random.randn(n_subjects, n_abilities)
# Shape: (n_subjects, n_abilities)
# 2. Create the 'Blueprint': Which abilities contribute to which test?
# We create a matrix where rows=tests, cols=abilities. 1 means the ability contributes.
test_ability_map = np.zeros((n_tests, n_abilities))
print("Number of abilities sampled by each test:")
for i in range(n_tests):
    # Randomly choose how many abilities this test uses
    num_linked_abilities = np.random.randint(abilities_per_test_range[0], abilities_per_test_range[1] + 1)
    print(f" Test {i+1}: {num_linked_abilities}")
    # Randomly choose *which* abilities this test uses
    ability_indices = np.random.choice(n_abilities, num_linked_abilities, replace=False)
    # Mark these abilities in the map
    test_ability_map[i, ability_indices] = 1
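# Optional sanity check (a sketch using only the variables defined above): because the
# abilities are independent with unit variance, the noise-free correlation between two
# tests equals their shared-ability count divided by sqrt(n_i * n_j), where n_i and n_j
# are the numbers of abilities each test samples. Printing these theoretical values shows
# why overlapping sampling alone should produce a positive manifold.
overlap_counts = test_ability_map @ test_ability_map.T  # (n_tests, n_tests); diagonal = abilities per test
abilities_per_test = np.diag(overlap_counts)
theoretical_corr = overlap_counts / np.sqrt(np.outer(abilities_per_test, abilities_per_test))
print("\nTheoretical (noise-free) correlations implied by ability overlap:")
print(np.round(theoretical_corr, 3))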
# 3. Calculate 'True' Test Scores
# Each subject's score on a test is the sum of their scores on the abilities linked to that test.
# We can do this efficiently with matrix multiplication:
# (n_subjects, n_abilities) @ (n_abilities, n_tests) -> (n_subjects, n_tests)
true_test_scores = abilities @ test_ability_map.T
# 4. Add Measurement Noise
# Simulate that real-world tests aren't perfect measures
noise = np.random.normal(loc=0, scale=noise_std_dev, size=true_test_scores.shape)
observed_test_scores = true_test_scores + noise
# 5. Standardize the Observed Test Scores
# Factor analysis typically works on correlations, so we standardize the data (mean=0, std=1)
scaler = StandardScaler()
scaled_scores = scaler.fit_transform(observed_test_scores)
# --- Analysis ---
# 6. Examine the Correlation Matrix of Observed Test Scores
# We expect positive correlations because tests share underlying abilities by chance.
print("\n" + "="*40)
print("Correlation Matrix of Observed Test Scores:")
print("="*40)
correlation_matrix = np.corrcoef(scaled_scores.T) # Use transpose: rows=variables for corrcoef
# Use pandas for nicer display
corr_df = pd.DataFrame(correlation_matrix,
                       index=[f'Test_{i+1}' for i in range(n_tests)],
                       columns=[f'Test_{i+1}' for i in range(n_tests)])
print(corr_df.round(3))
# Optional: Visualize the correlation matrix
plt.figure(figsize=(7, 5))
sns.heatmap(corr_df, annot=True, cmap='viridis', fmt=".2f", linewidths=.5)
plt.title("Correlation Matrix of Simulated Test Scores\n(from Independent Abilities)")
plt.tight_layout()
plt.show()
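# Optional summary (a sketch): the mean off-diagonal correlation gives a single number
# for the strength of the positive manifold visible in the heatmap above.
off_diagonal = correlation_matrix[~np.eye(n_tests, dtype=bool)]
print(f"\nAverage off-diagonal correlation: {off_diagonal.mean():.3f}")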
# 7. Perform Factor Analysis
# We ask for just one factor, representing the hypothetical 'g'
print("\n" + "="*40)
print("Factor Analysis Results (Extracting 1 Factor):")
print("="*40)
fa = FactorAnalysis(n_components=1, random_state=0, svd_method='lapack') # Use lapack for stability
fa.fit(scaled_scores)
# 8. Examine the Factor Loadings
# These show how strongly each test relates to the extracted factor.
# We expect them to be mostly positive if a 'g' factor is found.
loadings = fa.components_.T # fa.components_ is (n_components, n_features)
loadings_df = pd.DataFrame(loadings,
                           index=[f'Test_{i+1}' for i in range(n_tests)],
                           columns=['Loading_on_Factor_1 (g)'])
print("Factor Loadings:")
print(loadings_df.round(3))
# Calculate variance potentially explained by this factor
# NOTE: sklearn's FactorAnalysis doesn't directly give % variance like PCA.
# We can estimate it based on loadings squared (communality for 1 factor).
communalities = np.sum(loadings**2, axis=1)
total_variance = n_tests # Since data is standardized, total variance is approx. n_features
variance_explained_by_factor = np.sum(communalities)
prop_variance_explained = variance_explained_by_factor / total_variance
print(f"\nApprox. Proportion of Variance Explained by Factor 1: {prop_variance_explained:.3f}")
# --- Interpretation ---
print("\n" + "="*40)
print("Interpretation:")
print("="*40)
print(f"We simulated {n_subjects} subjects taking {n_tests} tests.")
print(f"The scores were generated by summing subsets of {n_abilities} entirely *independent* underlying abilities.")
print("Critically, **no actual general factor ('g')** was part of the simulation's design.")
print("\nResults Analysis:")
print("- The correlation matrix shows predominantly positive correlations between tests. This 'positive manifold' arises simply because tests randomly sampled overlapping sets of the independent abilities.")
print("- Factor Analysis successfully extracted a single factor.")
print("- The loadings on this factor are all positive, resembling a 'g' factor.")
print(f"- This single factor accounts for approximately {prop_variance_explained:.1%} of the total variance across tests.")
print("\nConclusion:")
print("This simulation illustrates Godfrey Thomson's point: the appearance of a general factor 'g' ")
print("from factor analysis can be a mathematical artifact resulting from the correlations caused")
print("by overlapping pools of many independent underlying abilities, rather than definitive proof")
print("of a single, unified causal factor.")
print("="*40)