rahimnathwani · April 28, 2025 05:21
diff --git a/g.py b/g.py
 import numpy as np
 from sklearn.decomposition import FactorAnalysis
 from sklearn.preprocessing import StandardScaler
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt

 # --- Parameters ---
 # Spearman's early work involved small numbers of tests/variables and modest samples.
 n_subjects = 200   # Number of simulated individuals
 n_tests = 6        # Number of observable tests (e.g., different school subjects or simple tasks)
 n_abilities = 150  # Total number of underlying independent abilities (needs to be > n_tests)

 # Each test 'samples' a random number of these independent abilities
 abilities_per_test_range = (20, 50) # Min/max number of abilities contributing to a single test score

 # Add some noise to simulate imperfect measurement
 noise_std_dev = 0.6

 # --- Simulation Steps ---

 # 1. Generate Underlying Independent Abilities
 # Each ability score for each subject is drawn independently from a standard normal distribution.
 np.random.seed(42) # Set seed for reproducible results
 abilities = np.random.randn(n_subjects, n_abilities)
 # Shape: (n_subjects, n_abilities)

 # 2. Create the 'Blueprint': Which abilities contribute to which test?
 # We create a matrix where rows=tests, cols=abilities. 1 means the ability contributes.
 test_ability_map = np.zeros((n_tests, n_abilities))
 print("Number of abilities sampled by each test:")
 for i in range(n_tests):
    # Randomly choose how many abilities this test uses
    num_linked_abilities = np.random.randint(abilities_per_test_range[0], abilities_per_test_range[1] + 1)
    print(f"  Test {i+1}: {num_linked_abilities}")
    # Randomly choose *which* abilities this test uses
    ability_indices = np.random.choice(n_abilities, num_linked_abilities, replace=False)
    # Mark these abilities in the map
    test_ability_map[i, ability_indices] = 1

 # 3. Calculate 'True' Test Scores
 # Each subject's score on a test is the sum of their scores on the abilities linked to that test.
 # We can do this efficiently with matrix multiplication:
 # (n_subjects, n_abilities) @ (n_abilities, n_tests) -> (n_subjects, n_tests)
 true_test_scores = abilities @ test_ability_map.T

 # 4. Add Measurement Noise
 # Simulate that real-world tests aren't perfect measures
 noise = np.random.normal(loc=0, scale=noise_std_dev, size=true_test_scores.shape)
 observed_test_scores = true_test_scores + noise

 # 5. Standardize the Observed Test Scores
 # Factor analysis typically works on correlations, so we standardize the data (mean=0, std=1)
 scaler = StandardScaler()
 scaled_scores = scaler.fit_transform(observed_test_scores)

 # --- Analysis ---

 # 6. Examine the Correlation Matrix of Observed Test Scores
 # We expect positive correlations because tests share underlying abilities by chance.
 print("\n" + "="*40)
 print("Correlation Matrix of Observed Test Scores:")
 print("="*40)
 correlation_matrix = np.corrcoef(scaled_scores.T) # Use transpose: rows=variables for corrcoef
 # Use pandas for nicer display
 corr_df = pd.DataFrame(correlation_matrix,
                       index=[f'Test_{i+1}' for i in range(n_tests)],
                       columns=[f'Test_{i+1}' for i in range(n_tests)])
 print(corr_df.round(3))

 # Optional: Visualize the correlation matrix
 plt.figure(figsize=(7, 5))
 sns.heatmap(corr_df, annot=True, cmap='viridis', fmt=".2f", linewidths=.5)
 plt.title("Correlation Matrix of Simulated Test Scores\n(from Independent Abilities)")
 plt.tight_layout()
 plt.show()


 # 7. Perform Factor Analysis
 # We ask for just one factor, representing the hypothetical 'g'
 print("\n" + "="*40)
 print("Factor Analysis Results (Extracting 1 Factor):")
 print("="*40)
 fa = FactorAnalysis(n_components=1, random_state=0, svd_method='lapack') # Use lapack for stability
 fa.fit(scaled_scores)

 # 8. Examine the Factor Loadings
 # These show how strongly each test relates to the extracted factor.
 # We expect them to be mostly positive if a 'g' factor is found.
 loadings = fa.components_.T # fa.components_ is (n_components, n_features)
 loadings_df = pd.DataFrame(loadings,
                           index=[f'Test_{i+1}' for i in range(n_tests)],
                           columns=['Loading_on_Factor_1 (g)'])
 print("Factor Loadings:")
 print(loadings_df.round(3))

 # Calculate variance potentially explained by this factor
 # NOTE: sklearn's FactorAnalysis doesn't directly give % variance like PCA.
 # We can estimate it based on loadings squared (communality for 1 factor).
 communalities = np.sum(loadings**2, axis=1)
 total_variance = n_tests # Since data is standardized, total variance is approx. n_features
 variance_explained_by_factor = np.sum(communalities)
 prop_variance_explained = variance_explained_by_factor / total_variance
 print(f"\nApprox. Proportion of Variance Explained by Factor 1: {prop_variance_explained:.3f}")

 # --- Interpretation ---
 print("\n" + "="*40)
 print("Interpretation:")
 print("="*40)
 print(f"We simulated {n_subjects} subjects taking {n_tests} tests.")
 print(f"The scores were generated by summing subsets of {n_abilities} entirely *independent* underlying abilities.")
 print("Critically, **no actual general factor ('g')** was part of the simulation's design.")
 print("\nResults Analysis:")
 print("- The correlation matrix shows predominantly positive correlations between tests. This 'positive manifold' arises simply because tests randomly sampled overlapping sets of the independent abilities.")
 print("- Factor Analysis successfully extracted a single factor.")
 print("- The loadings on this factor are all positive, resembling a 'g' factor.")
 print(f"- This single factor accounts for approximately {prop_variance_explained:.1%} of the total variance across tests.")
 print("\nConclusion:")
 print("This simulation illustrates Godfrey Thomson's point: the appearance of a general factor 'g' ")
 print("from factor analysis can be a mathematical artifact resulting from the correlations caused")
 print("by overlapping pools of many independent underlying abilities, rather than definitive proof")
 print("of a single, unified causal factor.")
 print("="*40)
	import numpy as np
	from sklearn.decomposition import FactorAnalysis
	from sklearn.preprocessing import StandardScaler
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt

	# --- Parameters ---
	# Spearman's early work involved small numbers of tests/variables and modest samples.
	n_subjects = 200 # Number of simulated individuals
	n_tests = 6 # Number of observable tests (e.g., different school subjects or simple tasks)
	n_abilities = 150 # Total number of underlying independent abilities (needs to be > n_tests)

	# Each test 'samples' a random number of these independent abilities
	abilities_per_test_range = (20, 50) # Min/max number of abilities contributing to a single test score

	# Add some noise to simulate imperfect measurement
	noise_std_dev = 0.6

	# --- Simulation Steps ---

	# 1. Generate Underlying Independent Abilities
	# Each ability score for each subject is drawn independently from a standard normal distribution.
	np.random.seed(42) # Set seed for reproducible results
	abilities = np.random.randn(n_subjects, n_abilities)
	# Shape: (n_subjects, n_abilities)

	# 2. Create the 'Blueprint': Which abilities contribute to which test?
	# We create a matrix where rows=tests, cols=abilities. 1 means the ability contributes.
	test_ability_map = np.zeros((n_tests, n_abilities))
	print("Number of abilities sampled by each test:")
	for i in range(n_tests):
	# Randomly choose how many abilities this test uses
	num_linked_abilities = np.random.randint(abilities_per_test_range[0], abilities_per_test_range[1] + 1)
	print(f" Test {i+1}: {num_linked_abilities}")
	# Randomly choose which abilities this test uses
	ability_indices = np.random.choice(n_abilities, num_linked_abilities, replace=False)
	# Mark these abilities in the map
	test_ability_map[i, ability_indices] = 1

	# 3. Calculate 'True' Test Scores
	# Each subject's score on a test is the sum of their scores on the abilities linked to that test.
	# We can do this efficiently with matrix multiplication:
	# (n_subjects, n_abilities) @ (n_abilities, n_tests) -> (n_subjects, n_tests)
	true_test_scores = abilities @ test_ability_map.T

	# 4. Add Measurement Noise
	# Simulate that real-world tests aren't perfect measures
	noise = np.random.normal(loc=0, scale=noise_std_dev, size=true_test_scores.shape)
	observed_test_scores = true_test_scores + noise

	# 5. Standardize the Observed Test Scores
	# Factor analysis typically works on correlations, so we standardize the data (mean=0, std=1)
	scaler = StandardScaler()
	scaled_scores = scaler.fit_transform(observed_test_scores)

	# --- Analysis ---

	# 6. Examine the Correlation Matrix of Observed Test Scores
	# We expect positive correlations because tests share underlying abilities by chance.
	print("\n" + "="*40)
	print("Correlation Matrix of Observed Test Scores:")
	print("="*40)
	correlation_matrix = np.corrcoef(scaled_scores.T) # Use transpose: rows=variables for corrcoef
	# Use pandas for nicer display
	corr_df = pd.DataFrame(correlation_matrix,
	index=[f'Test_{i+1}' for i in range(n_tests)],
	columns=[f'Test_{i+1}' for i in range(n_tests)])
	print(corr_df.round(3))

	# Optional: Visualize the correlation matrix
	plt.figure(figsize=(7, 5))
	sns.heatmap(corr_df, annot=True, cmap='viridis', fmt=".2f", linewidths=.5)
	plt.title("Correlation Matrix of Simulated Test Scores\n(from Independent Abilities)")
	plt.tight_layout()
	plt.show()


	# 7. Perform Factor Analysis
	# We ask for just one factor, representing the hypothetical 'g'
	print("\n" + "="*40)
	print("Factor Analysis Results (Extracting 1 Factor):")
	print("="*40)
	fa = FactorAnalysis(n_components=1, random_state=0, svd_method='lapack') # Use lapack for stability
	fa.fit(scaled_scores)

	# 8. Examine the Factor Loadings
	# These show how strongly each test relates to the extracted factor.
	# We expect them to be mostly positive if a 'g' factor is found.
	loadings = fa.components_.T # fa.components_ is (n_components, n_features)
	loadings_df = pd.DataFrame(loadings,
	index=[f'Test_{i+1}' for i in range(n_tests)],
	columns=['Loading_on_Factor_1 (g)'])
	print("Factor Loadings:")
	print(loadings_df.round(3))

	# Calculate variance potentially explained by this factor
	# NOTE: sklearn's FactorAnalysis doesn't directly give % variance like PCA.
	# We can estimate it based on loadings squared (communality for 1 factor).
	communalities = np.sum(loadings**2, axis=1)
	total_variance = n_tests # Since data is standardized, total variance is approx. n_features
	variance_explained_by_factor = np.sum(communalities)
	prop_variance_explained = variance_explained_by_factor / total_variance
	print(f"\nApprox. Proportion of Variance Explained by Factor 1: {prop_variance_explained:.3f}")

	# --- Interpretation ---
	print("\n" + "="*40)
	print("Interpretation:")
	print("="*40)
	print(f"We simulated {n_subjects} subjects taking {n_tests} tests.")
	print(f"The scores were generated by summing subsets of {n_abilities} entirely independent underlying abilities.")
	print("Critically, no actual general factor ('g') was part of the simulation's design.")
	print("\nResults Analysis:")
	print("- The correlation matrix shows predominantly positive correlations between tests. This 'positive manifold' arises simply because tests randomly sampled overlapping sets of the independent abilities.")
	print("- Factor Analysis successfully extracted a single factor.")
	print("- The loadings on this factor are all positive, resembling a 'g' factor.")
	print(f"- This single factor accounts for approximately {prop_variance_explained:.1%} of the total variance across tests.")
	print("\nConclusion:")
	print("This simulation illustrates Godfrey Thomson's point: the appearance of a general factor 'g' ")
	print("from factor analysis can be a mathematical artifact resulting from the correlations caused")
	print("by overlapping pools of many independent underlying abilities, rather than definitive proof")
	print("of a single, unified causal factor.")
	print("="*40)