-
-
Save rahimnathwani/e8aa1d9ea3d1f8a95cba4989a6976806 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn.decomposition import FactorAnalysis | |
from sklearn.preprocessing import StandardScaler | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
# --- Parameters --- | |
# Spearman's early work involved small numbers of tests/variables and modest samples. | |
n_subjects = 200 # Number of simulated individuals | |
n_tests = 6 # Number of observable tests (e.g., different school subjects or simple tasks) | |
n_abilities = 150 # Total number of underlying independent abilities (needs to be > n_tests) | |
# Each test 'samples' a random number of these independent abilities | |
abilities_per_test_range = (20, 50) # Min/max number of abilities contributing to a single test score | |
# Add some noise to simulate imperfect measurement | |
noise_std_dev = 0.6 | |
# --- Simulation Steps --- | |
# 1. Generate Underlying Independent Abilities | |
# Each ability score for each subject is drawn independently from a standard normal distribution. | |
np.random.seed(42) # Set seed for reproducible results | |
abilities = np.random.randn(n_subjects, n_abilities) | |
# Shape: (n_subjects, n_abilities) | |
# 2. Create the 'Blueprint': Which abilities contribute to which test? | |
# We create a matrix where rows=tests, cols=abilities. 1 means the ability contributes. | |
test_ability_map = np.zeros((n_tests, n_abilities)) | |
print("Number of abilities sampled by each test:") | |
for i in range(n_tests): | |
# Randomly choose how many abilities this test uses | |
num_linked_abilities = np.random.randint(abilities_per_test_range[0], abilities_per_test_range[1] + 1) | |
print(f" Test {i+1}: {num_linked_abilities}") | |
# Randomly choose *which* abilities this test uses | |
ability_indices = np.random.choice(n_abilities, num_linked_abilities, replace=False) | |
# Mark these abilities in the map | |
test_ability_map[i, ability_indices] = 1 | |
# 3. Calculate 'True' Test Scores | |
# Each subject's score on a test is the sum of their scores on the abilities linked to that test. | |
# We can do this efficiently with matrix multiplication: | |
# (n_subjects, n_abilities) @ (n_abilities, n_tests) -> (n_subjects, n_tests) | |
true_test_scores = abilities @ test_ability_map.T | |
# 4. Add Measurement Noise | |
# Simulate that real-world tests aren't perfect measures | |
noise = np.random.normal(loc=0, scale=noise_std_dev, size=true_test_scores.shape) | |
observed_test_scores = true_test_scores + noise | |
# 5. Standardize the Observed Test Scores | |
# Factor analysis typically works on correlations, so we standardize the data (mean=0, std=1) | |
scaler = StandardScaler() | |
scaled_scores = scaler.fit_transform(observed_test_scores) | |
# --- Analysis --- | |
# 6. Examine the Correlation Matrix of Observed Test Scores | |
# We expect positive correlations because tests share underlying abilities by chance. | |
print("\n" + "="*40) | |
print("Correlation Matrix of Observed Test Scores:") | |
print("="*40) | |
correlation_matrix = np.corrcoef(scaled_scores.T) # Use transpose: rows=variables for corrcoef | |
# Use pandas for nicer display | |
corr_df = pd.DataFrame(correlation_matrix, | |
index=[f'Test_{i+1}' for i in range(n_tests)], | |
columns=[f'Test_{i+1}' for i in range(n_tests)]) | |
print(corr_df.round(3)) | |
# Optional: Visualize the correlation matrix | |
plt.figure(figsize=(7, 5)) | |
sns.heatmap(corr_df, annot=True, cmap='viridis', fmt=".2f", linewidths=.5) | |
plt.title("Correlation Matrix of Simulated Test Scores\n(from Independent Abilities)") | |
plt.tight_layout() | |
plt.show() | |
# 7. Perform Factor Analysis | |
# We ask for just one factor, representing the hypothetical 'g' | |
print("\n" + "="*40) | |
print("Factor Analysis Results (Extracting 1 Factor):") | |
print("="*40) | |
fa = FactorAnalysis(n_components=1, random_state=0, svd_method='lapack') # Use lapack for stability | |
fa.fit(scaled_scores) | |
# 8. Examine the Factor Loadings | |
# These show how strongly each test relates to the extracted factor. | |
# We expect them to be mostly positive if a 'g' factor is found. | |
loadings = fa.components_.T # fa.components_ is (n_components, n_features) | |
loadings_df = pd.DataFrame(loadings, | |
index=[f'Test_{i+1}' for i in range(n_tests)], | |
columns=['Loading_on_Factor_1 (g)']) | |
print("Factor Loadings:") | |
print(loadings_df.round(3)) | |
# Calculate variance potentially explained by this factor | |
# NOTE: sklearn's FactorAnalysis doesn't directly give % variance like PCA. | |
# We can estimate it based on loadings squared (communality for 1 factor). | |
communalities = np.sum(loadings**2, axis=1) | |
total_variance = n_tests # Since data is standardized, total variance is approx. n_features | |
variance_explained_by_factor = np.sum(communalities) | |
prop_variance_explained = variance_explained_by_factor / total_variance | |
print(f"\nApprox. Proportion of Variance Explained by Factor 1: {prop_variance_explained:.3f}") | |
# --- Interpretation --- | |
print("\n" + "="*40) | |
print("Interpretation:") | |
print("="*40) | |
print(f"We simulated {n_subjects} subjects taking {n_tests} tests.") | |
print(f"The scores were generated by summing subsets of {n_abilities} entirely *independent* underlying abilities.") | |
print("Critically, **no actual general factor ('g')** was part of the simulation's design.") | |
print("\nResults Analysis:") | |
print("- The correlation matrix shows predominantly positive correlations between tests. This 'positive manifold' arises simply because tests randomly sampled overlapping sets of the independent abilities.") | |
print("- Factor Analysis successfully extracted a single factor.") | |
print("- The loadings on this factor are all positive, resembling a 'g' factor.") | |
print(f"- This single factor accounts for approximately {prop_variance_explained:.1%} of the total variance across tests.") | |
print("\nConclusion:") | |
print("This simulation illustrates Godfrey Thomson's point: the appearance of a general factor 'g' ") | |
print("from factor analysis can be a mathematical artifact resulting from the correlations caused") | |
print("by overlapping pools of many independent underlying abilities, rather than definitive proof") | |
print("of a single, unified causal factor.") | |
print("="*40) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment