I am creating a query to obtain a list of emails from a dataset that is contaminated with invalid email formats, I have a query with many filters but there are many that cannot obtain the filters and when using the programmed query it generates an error. ingestion into the dataset. (Format validation for field :personalEmail.address of type: String failed. The input value is daimer100@gmail.conm and expected format was: email. Please update the field and try again.) The list of errors is quite extensive and varied, not I see it viable to do this through query filters, how else could I do this query?
QUERY:
WITH valid_emails AS (
SELECT
uuid() AS _id,
CASE
WHEN identityMap['email'][0]['id'] LIKE '%_@__%.__%'
AND identityMap['email'][0]['id'] ~ '^[a-zA-Z0-9.!#$%&''*+/=?^_`{|}~-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}$' -- Validación RFC2822
AND NOT (identityMap['email'][0]['id'] LIKE '%..%') -- No contiene puntos consecutivos
AND NOT (identityMap['email'][0]['id'] LIKE '%.@%') -- No contiene un punto antes de la arroba
AND NOT (identityMap['email'][0]['id'] LIKE '%.%@%') -- No contiene un punto antes o después de la arroba
AND NOT (identityMap['email'][0]['id'] LIKE '%@%.') -- No contiene un punto después de la arroba
AND NOT (identityMap['email'][0]['id'] LIKE '%.con') -- No termina en .con
AND NOT (identityMap['email'][0]['id'] LIKE '%/%') -- No contiene una barra inclinada
THEN named_struct('address', identityMap['email'][0]['id'])
WHEN identityMap['email'][1]['id'] LIKE '%_@__%.__%'
AND identityMap['email'][1]['id'] ~ '^[a-zA-Z0-9.!#$%&''*+/=?^_`{|}~-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}$' -- Validación RFC2822
AND NOT (identityMap['email'][1]['id'] LIKE '%..%') -- No contiene puntos consecutivos
AND NOT (identityMap['email'][1]['id'] LIKE '%.@%') -- No contiene un punto antes de la arroba
AND NOT (identityMap['email'][1]['id'] LIKE '%.%@%') -- No contiene un punto antes o después de la arroba
AND NOT (identityMap['email'][1]['id'] LIKE '%@%.') -- No contiene un punto después de la arroba
AND NOT (identityMap['email'][1]['id'] LIKE '%.con') -- No termina en .con
AND NOT (identityMap['email'][1]['id'] LIKE '%/%') -- No contiene una barra inclinada
THEN named_struct('address', identityMap['email'][1]['id'])
WHEN identityMap['email'][2]['id'] LIKE '%_@__%.__%'
AND identityMap['email'][2]['id'] ~ '^[a-zA-Z0-9.!#$%&''*+/=?^_`{|}~-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}$' -- Validación RFC2822
AND NOT (identityMap['email'][2]['id'] LIKE '%..%') -- No contiene puntos consecutivos
AND NOT (identityMap['email'][2]['id'] LIKE '%.@%') -- No contiene un punto antes de la arroba
AND NOT (identityMap['email'][2]['id'] LIKE '%.%@%') -- No contiene un punto antes o después de la arroba
AND NOT (identityMap['email'][2]['id'] LIKE '%@%.') -- No contiene un punto después de la arroba
AND NOT (identityMap['email'][2]['id'] LIKE '%.con') -- No termina en .con
AND NOT (identityMap['email'][2]['id'] LIKE '%/%') -- No contiene una barra inclinada
THEN named_struct('address', identityMap['email'][2]['id'])
-- Agrega más condiciones si hay más posiciones
ELSE NULL
END AS personalEmail
FROM
profile_snapshot_export_*profile_snapshot_number*
WHERE
segmentmembership['ups']['*segmentmembership NUMBER*'].status IN ('realized', 'existing')
AND (identityMap['email'][0]['id'] IS NOT NULL
OR identityMap['email'][1]['id'] IS NOT NULL
OR identityMap['email'][2]['id'] IS NOT NULL -- Agrega más condiciones si hay más posiciones
)
)
SELECT
_id,
personalEmail
FROM
valid_emails
WHERE
personalEmail IS NOT null;