Thanks for the reply!
Code follows.
I added the verify_npz option after discovering this issue (though I’m not sure how robust it is).
I don’t have one of the bad .npz files at hand, but it seems most of the y
values were 0, with a smattering of other sleep stages (1-4).
I didn’t notice this problem with 6 threads (although it may have occurred in a milder form), but when I increased the thread count to 20, it became obvious that something was wrong: my model predicted that almost everything was sleep stage 0.
`process_ann_eeg()` filters and epochs the annotated EEG file multiple times, returning those data in the `features` variable, which is a NumPy array.
`group` is just a file identifier.
A thought I just had: could the X, y arrays go out of scope before np.savez is finished writing them?
def process_file(rml_filename, verify_npz=False, provenance="none"):
    """Turn one recording into a processed ``.npz`` file of features and labels.

    The annotated recording is read from ``../data/annotated/`` when a cached
    ``.fif`` file already exists there; otherwise it is built by
    ``label_eeg_using_rml`` from the ``[001].edf`` file that shares its base
    path with *rml_filename*, and written to that cache at the very end.

    Epochs whose class label equals 5 are dropped. When no epochs remain, a
    message is printed and the function returns early: nothing is written,
    including the annotated-recording cache.

    All paths are relative to the current working directory.

    Args:
        rml_filename: Path of the ``.rml`` file. Its extension is stripped to
            derive both the EDF path and the base name of the cache file.
        verify_npz: When true, re-read the saved archive and compare every
            stored entry against the in-memory value.
        provenance: Value stored in the archive under the ``provenance`` key.

    Returns:
        None.

    Raises:
        AssertionError: If ``verify_npz`` is true and a stored entry differs
            from the value that was passed to ``np.savez``.
    """
    filename, _ = splitext(rml_filename)
    eeg_filename = filename + "[001].edf"
    base_filename = basename(filename)
    annotated_eeg_filename = join(
        "../data/annotated/", base_filename + f"_{FS}Hz_ann_raw.fif"
    )

    if exists(annotated_eeg_filename):
        annotated_eeg = mne.io.read_raw(annotated_eeg_filename, preload=True)
    else:
        annotated_eeg = label_eeg_using_rml(eeg_filename, rml_filename)

    epoched_eeg, features, group = process_ann_eeg(annotated_eeg)
    num_epochs = len(epoched_eeg.events)
    print(f"Processing {group}: {num_epochs} events")

    # Whole-array conversions instead of a per-epoch Python loop. Only the
    # first ``num_epochs`` feature rows are used, one per event.
    X = np.asarray(features[:num_epochs], dtype=np.float32)
    # The third field of each event row is the class label.
    y = np.fromiter(
        (event[2] for event in epoched_eeg.events),
        dtype=np.int16,
        count=num_epochs,
    )

    # Boolean-mask indexing copies, so X and y own their data from here on.
    not_bad = y != 5
    X = X[not_bad, :, :]
    y = y[not_bad]
    if y.size == 0:
        print(f"Skipped {group}, y is empty")
        return

    classes, counts = np.unique(y, return_counts=True)
    for stage, count in zip(classes, counts):
        print(f"{ALL_STAGES[int(stage)]:10s}: {count:7,d}")

    processed_filename = (
        f"../data/processed_data/{group}_{FS}Hz_{NUM_SAMPLES}x{NUM_CHANNELS}.npz"
    )
    np.savez(
        processed_filename,
        X=X,
        y=y,
        group=group,
        provenance=provenance,
    )
    print(f"Saved: {processed_filename}")

    if verify_npz:
        # np.load returns an NpzFile that holds the archive open; the
        # with-block guarantees the handle is released even if a check fails.
        with np.load(processed_filename) as data:
            # Explicit raises (not ``assert``) so the checks survive
            # ``python -O``; the exception type and messages are unchanged.
            for name, expected in (("X", X), ("y", y), ("group", group)):
                if not np.array_equal(expected, data[name]):
                    raise AssertionError(f"{name} did not verify!")
            if not (provenance == data["provenance"]):
                raise AssertionError("provenance did not verify!")
        print("Verified")

    # Cache the annotated recording only after processing succeeded.
    if not exists(annotated_eeg_filename):
        annotated_eeg.save(annotated_eeg_filename)