Performance optimizations: parallelize landmark feature extraction across CPU cores
This commit is contained in:
59
training.py
59
training.py
@@ -154,32 +154,59 @@ print("Loading Kaggle ASL dataset...")
|
|||||||
# Root directory of the Kaggle ASL dataset (relative to the working directory).
base_path = 'asl_kaggle'

# Load the training manifest (one row per sequence) and the sign-name ->
# class-index mapping.  NOTE(review): load_kaggle_asl_data is a project helper
# defined elsewhere in this file — presumably returns (DataFrame, dict); confirm.
train_df, sign_to_idx = load_kaggle_asl_data(base_path)
|
|
||||||
# Process landmarks
|
# Process landmarks with parallel processing
|
||||||
X = []
|
from multiprocessing import Pool, cpu_count
|
||||||
y = []
|
from functools import partial
|
||||||
|
|
||||||
print("\nProcessing landmark files...")
|
|
||||||
for idx, row in train_df.iterrows():
|
|
||||||
if idx % 1000 == 0:
|
|
||||||
print(f"Processed {idx}/{len(train_df)} sequences...")
|
|
||||||
|
|
||||||
|
def process_single_sequence(row, base_path):
    """Process a single sequence - designed for parallel execution.

    Intended to be mapped over by a multiprocessing ``Pool``: it never raises,
    returning ``(None, None)`` for any sequence that cannot be processed so the
    driver can simply filter failures out of the results.

    Args:
        row: A mapping/Series with at least ``'path'`` (parquet file location,
            relative to ``base_path``) and ``'sign'`` (the label).
        base_path: Root directory of the dataset.

    Returns:
        ``(features, sign)`` on success, ``(None, None)`` on any failure
        (missing file, unreadable landmarks, or feature extraction error).
    """
    parquet_path = os.path.join(base_path, row['path'])
    # Cheap guard before touching the parquet reader.
    if not os.path.exists(parquet_path):
        return None, None
    try:
        # extract_hand_landmarks_from_parquet / get_optimized_features are
        # project helpers defined earlier in this file.
        landmarks = extract_hand_landmarks_from_parquet(parquet_path)
        if landmarks is None:
            return None, None
        features = get_optimized_features(landmarks)
        if features is None:
            return None, None
        return features, row['sign']
    except Exception:
        # Deliberate best-effort swallow: one corrupt sequence must not kill a
        # worker process; the driver counts valid results instead.  (The bound
        # exception variable was unused, so it is dropped.)
        return None, None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Parallel landmark-processing driver.
# Fans per-sequence work out across every CPU core in fixed-size batches so
# progress (and the running count of valid sequences) can be reported between
# batches.  Failed sequences come back as (None, None) and are dropped.
# ---------------------------------------------------------------------------
print("\nProcessing landmark files with parallel processing...")
print(f"Using {cpu_count()} CPU cores")

# Materialize the DataFrame rows into a plain list so they can be pickled
# and shipped to worker processes.
rows_list = [row for _, row in train_df.iterrows()]

# Bind the dataset root up front; workers then receive only the row itself.
process_func = partial(process_single_sequence, base_path=base_path)

X = []
y = []
batch_size = 1000

with Pool(processes=cpu_count()) as pool:
    for start in range(0, len(rows_list), batch_size):
        chunk = rows_list[start:start + batch_size]
        # map() blocks until the whole batch is done, which gives us a
        # natural point to report progress.
        for feats, label in pool.map(process_func, chunk):
            if feats is not None and label is not None:
                X.append(feats)
                y.append(label)
        print(f"Processed {min(start + batch_size, len(rows_list))}/{len(rows_list)} sequences... (Valid: {len(X)})")

print(f"\nSuccessfully processed {len(X)} sequences")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user