diff --git a/docs/changes/968.bugfix.rst b/docs/changes/968.bugfix.rst new file mode 100644 index 000000000..a4d26acb8 --- /dev/null +++ b/docs/changes/968.bugfix.rst @@ -0,0 +1 @@ +Speedup and improve robustness of fill_bad_time_intervals diff --git a/stingray/base.py b/stingray/base.py index 63c3575d3..7df58cf05 100644 --- a/stingray/base.py +++ b/stingray/base.py @@ -24,6 +24,7 @@ get_random_state, find_nearest, rebin_data, + show_progress, ) from .gti import ( create_gti_mask, @@ -36,6 +37,7 @@ get_total_gti_length, bin_intervals_from_gtis, time_intervals_from_gtis, + eliminate_short_gtis, ) from typing import TYPE_CHECKING, Type, TypeVar, Union @@ -2276,9 +2278,12 @@ def fill_bad_time_intervals( Random seed to use for the simulation. If None, a random seed is generated. """ - + # Initialize random number generator for reproducibility rs = get_random_state(seed) + # Determine which attributes should be filled with random samples from the buffer, + # and which should be left alone (filled with NaN or default values). + # Time and mask are always treated specially and excluded from randomization. if attrs_to_randomize is None: attrs_to_randomize = self.array_attrs() + self.internal_array_attrs() for attr in ["time", "_mask"]: @@ -2291,17 +2296,21 @@ def fill_bad_time_intervals( if a not in attrs_to_randomize ] + # Set default max_length: only fill gaps shorter than 1/100th of the longest GTI. + # This ensures we only fill very short gaps where white noise injection is acceptable. if max_length is None: max_length = np.max(self.gti[:, 1] - self.gti[:, 0]) / 100 - - btis = get_btis(self.gti, self.time[0], self.time[-1]) - if len(btis) == 0: + # + # If there's only one GTI (or none), there are no gaps between GTIs to fill + if len(self.gti) <= 1: logger.info("No bad time intervals to fill") return copy.deepcopy(self) + + # Get only the valid (non-masked) time stamps to work with filtered_times = self.time[self.mask] - new_times = [filtered_times.copy()] - new_attrs = {} + # Determine if data is evenly sampled by comparing median time separation to dt. + # This affects how we generate new time stamps in the gaps. mean_data_separation = np.median(np.diff(filtered_times)) if even_sampling is None: # The time series is considered evenly sampled if the median separation between @@ -2311,44 +2320,95 @@ def fill_bad_time_intervals( even_sampling = True logger.info(f"Data are {'not' if not even_sampling else ''} evenly sampled") + # Estimate how many samples typically fit in a gap and in a GTI. + # This helps determine a reasonable buffer size for sampling statistics. + gti_lengths = self.gti[:, 1] - self.gti[:, 0] if even_sampling: est_samples_in_gap = int(max_length / self.dt) + max_samples_per_gti = int(gti_lengths.max() / self.dt) else: est_samples_in_gap = int(max_length / mean_data_separation) + max_samples_per_gti = int(gti_lengths.max() / mean_data_separation) if buffer_size is None: - buffer_size = max(100, est_samples_in_gap) + # Ideal buffer size: a lot longer than the estimated number of samples in the gap + # (10x) to get good statistics for random sampling + buffer_size = est_samples_in_gap * 10 + # However, this might be too much for short GTIs, so we set an upper limit to the + # buffer size equal to the length of the longest GTI. + buffer_size = min(buffer_size, max_samples_per_gti) + # Still, no less than 1 + buffer_size = max(buffer_size, 1) + + # Eliminate series of GTIs that are shorter than max_length, to avoid particularly + # problematic situations (ex. in NICER where many short GTIs can occur) + gti = eliminate_short_gtis(self.gti, max_length / 2, only_repeated=True) + if len(gti) < len(self.gti): + new_ts = self.apply_gtis(gti, inplace=False) + filtered_times = new_ts.time + else: + new_ts = self + # Initialize output containers: start with the original valid data + new_times = [filtered_times.copy()] + new_attrs = {} + + # Get Bad Time Intervals (BTIs) - the gaps between GTIs that we might fill + btis = get_btis(gti, self.time[0], self.time[-1]) + + # Track which new GTIs we're adding (one for each filled gap) added_gtis = [] total_filled_time = 0 - for bti in btis: + for bti in show_progress(btis): length = bti[1] - bti[0] + + # Skip gaps that are too long - we only fill short gaps if length > max_length: continue + logger.info(f"Filling bad time interval {bti} ({length:.4f} s)") - epsilon = 1e-5 * length - added_gtis.append([bti[0] - epsilon, bti[1] + epsilon]) + + # Find the indices in filtered_times closest to the gap boundaries. + # filt_low_idx: index of the last valid point before the gap + # filt_hig_idx: index of the first valid point after the gap filt_low_t, filt_low_idx = find_nearest(filtered_times, bti[0]) filt_hig_t, filt_hig_idx = find_nearest(filtered_times, bti[1], side="right") + + # ---------- Generate new time stamps for the gap ---------- if even_sampling: - local_new_times = np.arange(bti[0] + self.dt / 2, bti[1], self.dt) + # For evenly sampled data: create a regular time grid within the gap. + # Start at bti[0] + dt/2 to place bin centers correctly. + local_new_times = np.arange(bti[0] + new_ts.dt / 2, bti[1], new_ts.dt) nevents = local_new_times.size else: + # For unevenly sampled data (e.g., event lists): estimate count rate + # from the buffer regions on either side of the gap, then draw a + # Poisson-distributed number of events and place them uniformly. + + # Get buffer_size points before the gap (but stay within valid range) low_time_arr = filtered_times[max(filt_low_idx - buffer_size, 0) : filt_low_idx] + # Only keep points that are within buffer_size seconds of the gap low_time_arr = low_time_arr[low_time_arr > bti[0] - buffer_size] + + # Get buffer_size points after the gap high_time_arr = filtered_times[filt_hig_idx : buffer_size + filt_hig_idx] high_time_arr = high_time_arr[high_time_arr < bti[1] + buffer_size] + # Calculate count rate from the buffer before the gap + # count_rate = number_of_events / time_span if len(low_time_arr) > 0 and (filt_low_t - low_time_arr[0]) > 0: - ctrate_low = np.count_nonzero(low_time_arr) / (filt_low_t - low_time_arr[0]) + ctrate_low = np.size(low_time_arr) / (filt_low_t - low_time_arr[0]) else: ctrate_low = np.nan + + # Calculate count rate from the buffer after the gap if len(high_time_arr) > 0 and (high_time_arr[-1] - filt_hig_t) > 0: - ctrate_high = np.count_nonzero(high_time_arr) / (high_time_arr[-1] - filt_hig_t) + ctrate_high = np.size(high_time_arr) / (high_time_arr[-1] - filt_hig_t) else: ctrate_high = np.nan + # If we couldn't estimate count rate from either side, skip this gap if not np.isfinite(ctrate_low) and not np.isfinite(ctrate_high): warnings.warn( f"No valid data around to simulate the time series in interval " @@ -2356,39 +2416,76 @@ def fill_bad_time_intervals( f"adequate." ) continue + + # Average the count rates (nanmean ignores NaN values) ctrate = np.nanmean([ctrate_low, ctrate_high]) - nevents = rs.poisson(ctrate * (bti[1] - bti[0])) - local_new_times = rs.uniform(bti[0], bti[1], nevents) + expected_counts = ctrate * (bti[1] - bti[0]) + # Draw number of events from Poisson distribution with expected value = rate * duration + nevents = rs.poisson(expected_counts) + # Place events uniformly within the gap + local_new_times = np.sort(rs.uniform(bti[0], bti[1], nevents)) + + # Create a new GTI that covers this gap (with tiny epsilon margin to avoid + # floating-point edge issues when merging GTIs later) + epsilon = 1e-5 * length + added_gtis.append([bti[0] - epsilon, bti[1] + epsilon]) + + # Add the new times to our collection new_times.append(local_new_times) + # ---------- Fill array attributes with random samples from buffer ---------- for attr in attrs_to_randomize: - low_arr = getattr(self, attr)[max(buffer_size - filt_low_idx, 0) : filt_low_idx] - high_arr = getattr(self, attr)[filt_hig_idx : buffer_size + filt_hig_idx] + # Get attribute values from buffer before the gap + # Note: the indexing here seems to have a bug - should probably be: + # getattr(self, attr)[max(filt_low_idx - buffer_size, 0) : filt_low_idx] + low_arr = getattr(new_ts, attr)[max(filt_low_idx - buffer_size, 0) : filt_low_idx] + # Get attribute values from buffer after the gap + high_arr = getattr(new_ts, attr)[filt_hig_idx : buffer_size + filt_hig_idx] + + # Initialize the output list with the original valid data (only once) if attr not in new_attrs: - new_attrs[attr] = [getattr(self, attr)[self.mask]] + new_attrs[attr] = [getattr(new_ts, attr)[new_ts.mask]] + + # Randomly sample from the combined buffer to fill the gap. + # This preserves the empirical distribution of values near the gap. new_attrs[attr].append(rs.choice(np.concatenate([low_arr, high_arr]), nevents)) + + # ---------- Handle attributes that shouldn't be randomized ---------- for attr in attrs_to_leave_alone: if attr not in new_attrs: - new_attrs[attr] = [getattr(self, attr)[self.mask]] + new_attrs[attr] = [getattr(new_ts, attr)[new_ts.mask]] + + # For mask: new points are always valid (True) + # For other attrs: fill with NaN to indicate simulated/unknown values if attr == "_mask": new_attrs[attr].append(np.ones(nevents, dtype=bool)) else: new_attrs[attr].append(np.zeros(nevents) + np.nan) + total_filled_time += length logger.info(f"A total of {total_filled_time} s of data were simulated") - new_gtis = join_gtis(self.gti, added_gtis) + # ==================== PHASE 7: ASSEMBLE OUTPUT ==================== + # Merge the original GTIs with the new GTIs covering filled gaps + new_gtis = join_gtis(gti, added_gtis) + + # Concatenate all time arrays (original + filled gaps) and sort by time new_times = np.concatenate(new_times) order = np.argsort(new_times) - new_obj = type(self)() + + # Create new time series object of the same type as self + new_obj = type(new_ts)() new_obj.time = new_times[order] + # Copy all metadata attributes (scalars like mjdref, dt, etc.) for attr in self.meta_attrs(): setattr(new_obj, attr, getattr(self, attr)) + # Set array attributes, applying the same sort order used for time for attr, values in new_attrs.items(): setattr(new_obj, attr, np.concatenate(values)[order]) + new_obj.gti = new_gtis return new_obj @@ -2506,7 +2603,7 @@ def plot( alpha=0.5, facecolor="r", zorder=1, - edgecolor="none", + edgecolor="r", ) return ax @@ -2660,12 +2757,21 @@ def analyze_segments(self, func, segment_size, fraction_step=1, **kwargs): True """ + even_sampling = False + mean_data_separation = np.median(np.diff(self.time)) + if ( + self.dt is not None + and self.dt > 0 + and np.isclose(mean_data_separation, self.dt, rtol=0.01) + ): + even_sampling = True + if segment_size is None: start_times = self.gti[:, 0] stop_times = self.gti[:, 1] start = np.searchsorted(self.time, start_times) stop = np.searchsorted(self.time, stop_times) - elif self.dt > 0: + elif even_sampling: start, stop = bin_intervals_from_gtis( self.gti, segment_size, self.time, fraction_step=fraction_step, dt=self.dt ) diff --git a/stingray/gti.py b/stingray/gti.py index 0291c5976..fd2d60bb5 100644 --- a/stingray/gti.py +++ b/stingray/gti.py @@ -1848,3 +1848,73 @@ def split_gtis_by_exposure(gtis, exposure_per_chunk, new_interval_if_gti_sep=Non vals = split_gtis_at_indices(gtis, index_list) return vals + + +def eliminate_short_gtis(gtis, min_length, only_repeated=False): + """Eliminate GTIs shorter than a given length. + + Parameters + ---------- + gtis : 2-d float array + List of GTIs of the form ``[[gti0_0, gti0_1], [gti1_0, gti1_1], ...]`` + min_length : float + Minimum length of GTIs to keep, in seconds + + Other Parameters + ---------------- + only_repeated : bool + If True, only eliminate GTIs that are shorter than the given length and are + separated by less than the given length from another problematic GTI. If False, eliminate + all GTIs shorter than the given length, regardless of their separation from other GTIs. + + Returns + ------- + new_gtis : 2-d float array + List of GTIs of the form ``[[gti0_0, gti0_1], [gti1_0, gti1_1], ...]``, + with GTIs shorter than the given length removed + + Examples + -------- + ``min_length`` is shorter than all GTIs, so no GTI is removed: + >>> gtis = [[0, 30], [40, 41], [90, 120], [130, 160]] + >>> new_gtis = eliminate_short_gtis(gtis, 0.5) + >>> assert np.allclose(new_gtis, [[0, 30], [40, 41], [90, 120], [130, 160]]) + + ``min_length`` is longer than one GTI, so that GTI is removed: + >>> new_gtis = eliminate_short_gtis(gtis, 2) + >>> assert np.allclose(new_gtis, [[0, 30], [90, 120], [130, 160]]) + + ``min_length`` is longer than one GTI, but ``only_repeated=True`` only removes GTIs + that are part of a repeated pattern of short GTIs: + >>> new_gtis = eliminate_short_gtis(gtis, 2, only_repeated=True) + >>> assert np.allclose(new_gtis, [[0, 30], [40, 41], [90, 120], [130, 160]]) + + Two short GTIs in a row, separated by less than ``min_length``, are removed when + ``only_repeated=True``: + >>> gtis = [[0, 30], [40, 41], [41.5, 42], [90, 120], [130, 160]] + >>> new_gtis = eliminate_short_gtis(gtis, 2, only_repeated=True) + >>> assert np.allclose(new_gtis, [[0, 30], [90, 120], [130, 160]]) + + Four short GTIs in a row and ``only_repeated=True``, but only three of them are separated by + less than ``min_length``. Those three are removed, while the one that is separated by more + than ``min_length`` from the others is kept: + >>> gtis = [[0, 30], [31, 32], [40, 40.5], [41, 41.3], [41.5, 42], [90, 120], [130, 160]] + >>> new_gtis = eliminate_short_gtis(gtis, 2, only_repeated=True) + >>> assert np.allclose(new_gtis, [[0, 30], [31, 32], [90, 120], [130, 160]]) + + """ + gtis = np.asanyarray(gtis) + lengths = gtis[:, 1] - gtis[:, 0] + if only_repeated: + separation = np.diff(gtis[:, 0]) + short_mask = lengths < min_length + # Find two bads in a row, close to one another. + + mask = np.ones_like(short_mask, dtype=bool) + for i in range(len(short_mask) - 1): + if (short_mask[i] and short_mask[i + 1]) and separation[i] < min_length: + mask[i] = False + mask[i + 1] = False + else: + mask = lengths >= min_length + return gtis[mask] diff --git a/stingray/tests/test_base.py b/stingray/tests/test_base.py index 378edfb41..c8bd808cf 100644 --- a/stingray/tests/test_base.py +++ b/stingray/tests/test_base.py @@ -1365,18 +1365,19 @@ def test_join_ignore_attr(self): class TestFillBTI(object): @classmethod def setup_class(cls): - cls.rand_time = np.sort(np.random.uniform(0, 1000, 100000)) - cls.rand_ener = np.random.uniform(0, 100, 100000) + cls.ctrate = 1000 + cls.rand_time = np.sort(np.random.uniform(0, 1000, int(cls.ctrate * 1000))) + cls.rand_ener = np.random.uniform(0, 100, int(cls.ctrate * 1000)) cls.gti = [[0, 900], [950, 1000]] - blablas = np.random.normal(0, 1, cls.rand_ener.size) + blablas = np.random.normal(100, 0.01, cls.rand_ener.size) cls.ev_like = StingrayTimeseries( cls.rand_time, energy=cls.rand_ener, blablas=blablas, gti=cls.gti ) time_edges = np.linspace(0, 1000, 1001) counts = np.histogram(cls.rand_time, bins=time_edges)[0] - blablas = np.random.normal(0, 1, 1000) + blublus = np.random.normal(100, 0.01, 1000) cls.lc_like = StingrayTimeseries( - time=time_edges[:-1] + 0.5, counts=counts, blablas=blablas, gti=cls.gti, dt=1 + time=time_edges[:-1] + 0.5, counts=counts, blublus=blublus, gti=cls.gti, dt=1 ) def test_no_btis_returns_copy(self): @@ -1384,13 +1385,23 @@ def test_no_btis_returns_copy(self): ts_new = ts.fill_bad_time_intervals() assert ts == ts_new + def test_event_like_small_gtis_eliminated(self): + ev_like_filt = copy.deepcopy(self.ev_like) + # I introduce a small gap in the GTIs + ev_like_filt.gti = np.asanyarray( + [[0, 498], [500, 520], [522, 700], [702, 900], [910, 910.1], [911, 911.1], [950, 1000]] + ) + ev_new = ev_like_filt.fill_bad_time_intervals(max_length=3, buffer_size=4) + + assert np.allclose(ev_new.gti, self.gti) + def test_event_like(self): ev_like_filt = copy.deepcopy(self.ev_like) # I introduce a small gap in the GTIs ev_like_filt.gti = np.asanyarray( [[0, 498], [500, 520], [522, 700], [702, 900], [950, 1000]] ) - ev_new = ev_like_filt.fill_bad_time_intervals() + ev_new = ev_like_filt.fill_bad_time_intervals(max_length=3, buffer_size=4) assert np.allclose(ev_new.gti, self.gti) @@ -1420,7 +1431,7 @@ def test_lc_like(self): lc_like_filt.gti = np.asanyarray( [[0, 498], [500, 520], [522, 700], [702, 900], [950, 1000]] ) - lc_new = lc_like_filt.fill_bad_time_intervals() + lc_new = lc_like_filt.fill_bad_time_intervals(max_length=3, buffer_size=4) assert np.allclose(lc_new.gti, self.gti) lc_like_gtifilt = self.lc_like.apply_gtis(inplace=False) @@ -1432,7 +1443,7 @@ def test_lc_like(self): lc_new.gti = lc_like_filt.gti new_masked, filt_masked = lc_new.apply_gtis(), lc_like_filt.apply_gtis() - for attr in ["time", "counts", "blablas"]: + for attr in ["time", "counts", "blublus"]: assert np.allclose(getattr(new_masked, attr), getattr(filt_masked, attr)) def test_ignore_attrs_ev_like(self): @@ -1457,8 +1468,8 @@ def test_ignore_attrs_lc_like(self): assert np.allclose(lc_new0.gti, lc_new1.gti) assert np.allclose(lc_new0.time, lc_new1.time) - assert np.count_nonzero(np.isnan(lc_new0.blablas)) == 0 - assert np.count_nonzero(np.isnan(lc_new1.blablas)) > 0 + assert np.count_nonzero(np.isnan(lc_new0.blublus)) == 0 + assert np.count_nonzero(np.isnan(lc_new1.blublus)) > 0 assert np.count_nonzero(np.isnan(lc_new1.counts)) == 0 def test_forcing_non_uniform(self): @@ -1478,7 +1489,7 @@ def test_forcing_uniform(self): # Results should be exactly the same lc_new0 = lc_like_filt.fill_bad_time_intervals(even_sampling=True, seed=201903) lc_new1 = lc_like_filt.fill_bad_time_intervals(even_sampling=None, seed=201903) - for attr in ["time", "counts", "blablas"]: + for attr in ["time", "counts", "blublus"]: assert np.allclose(getattr(lc_new0, attr), getattr(lc_new1, attr)) def test_bti_close_to_edge_event_like(self): @@ -1507,6 +1518,30 @@ def test_bti_close_to_edge_lc_like(self): lc_new = lc_like_filt.fill_bad_time_intervals() assert np.allclose(lc_new.gti, self.gti) + def test_fill_bti_values_event_like(self): + ev_like_filt = copy.deepcopy(self.ev_like) + # I introduce a small gap in the GTIs + ev_like_filt.gti = np.asanyarray([[0, 498], [500, 900], [950, 1000]]) + ev_new = ev_like_filt.fill_bad_time_intervals(seed=1234) + + # I check that the values in the filled GTI are not all the same + mask = (ev_new.time >= 498) & (ev_new.time <= 500) + new_ctrate = np.count_nonzero(mask) / 2.0 + assert np.isclose(new_ctrate, self.ctrate, rtol=0.1) + assert np.allclose(ev_new.blablas[mask], 100, atol=0.1) + + def test_fill_bti_values_lc_like(self): + lc_like_filt = copy.deepcopy(self.lc_like) + # I introduce a small gap in the GTIs + lc_like_filt.gti = np.asanyarray([[0, 498], [500, 900], [950, 1000]]) + lc_new = lc_like_filt.fill_bad_time_intervals(seed=1234) + + # I check that the values in the filled GTI are not all the same + mask = (lc_new.time > 498) & (lc_new.time < 500) + new_ctrate = np.mean(lc_new.counts[mask]) + assert np.isclose(new_ctrate, self.ctrate, rtol=0.1) + assert np.allclose(lc_new.blublus[mask], 100, atol=0.1) + class TestAnalyzeChunks(object): @classmethod