-- The CTE1 is just to create columns that dictate the bound of what is considered the same entry-- Also I do a dense rank by ACT_TIME, and a PARITION BY on ID1, ID2-- so all ID1/ID2 combos are ranked by when they ranWITH cte1 AS (SELECT *, (X-2) AS X_START, (X+2) AS X_END, (Y-2) AS Y_START, (Y+2) AS Y_END, (Z*1.2) AS Z_MAX, DENSE_RANK() OVER (PARTITION BY ID1, ID2 ORDER BY ACT_TIME) AS DENSE_RANK FROM data ORDER BY data.ACT_TIME),cte2 AS (-- Create new set of column as comparisonsSELECT ID AS ID_COMP,ID1 AS ID1_COMP,ID2 AS ID2_COMP,X_START AS X_START_COMP,X_END AS X_END_COMP,Y_START AS Y_START_COMP,Y_END AS Y_END_COMP,Z AS Z_MAX_COMP,DENSE_RANK AS DENSE_RANK_COMPFROM cte1), cte3 AS (-- join cte1 on cte2 only when X and Y value from cte1 is between the limits of cte2 AND -- Z max value from cte 2 is larger than Z value from cte1 AND ID1/ID2 match-- The result will have an ID of a row that should be removed since their x and y was in between the limits -- Then remove any rows where rank from cte2 is higher than cte1-- Remove any rows that were joined onto it self SELECT cte1.* ,cte2.*FROM cte1JOIN cte2 ON (( cte2.X_END_COMP >= cte1.X AND cte1.X >= cte2.X_START_COMP) AND (cte2.Y_END_COMP >= cte1.Y AND cte1.Y>= cte2.Y_START_COMP) AND (cte1.Z < cte2.Z_MAX_COMP) AND (cte2.ID2_COMP = cte1.ID2) AND (cte2.ID1_COMP = cte1.ID1))WHERE cte1.ID <> cte2.ID_COMP AND cte2.DENSE_RANK_COMP <= cte1.DENSE_RANK)-- Any IDs that shows up in cte3 remove from the final resultSELECT data.*FROM dataWHERE ID NOT IN (SELECT DISTINCT ID FROM cte3)ORDER BY data.ACT_TIME
Here's my create table
CREATE TABLE `data` ( `ID` INT(11) NOT NULL AUTO_INCREMENT, `ID1` VARCHAR(10) NULL DEFAULT NULL COLLATE 'latin1_swedish_ci', `ID2` INT(11) NULL DEFAULT NULL, `ACT_TIME` TIMESTAMP NULL DEFAULT NULL, `X` FLOAT(12) NULL DEFAULT NULL, `Y` FLOAT(12) NULL DEFAULT NULL, `Z` FLOAT(12) NULL DEFAULT NULL, PRIMARY KEY (`ID`) USING BTREE, INDEX `ID1` (`ID1`) USING BTREE, INDEX `ACT_TIME` (`ACT_TIME`) USING BTREE);
Here's the EXPLAIN {query} result
Here's how the query should work.
I want to remove any rows (that have the same ID1 and ID2) that occur later on where X and Y are in between +-2 and a Z less than 1.2*z
Here's an example input
Example output
This query takes about 5min with 2.5M rows.
I am on MariDB 10.5.5
Any and all help is appreciated!
EDIT for Rick James here's your explain {query}here is the explain result