diff options
Diffstat (limited to 'rushs/data-clash/step-3')
| -rw-r--r-- | rushs/data-clash/step-3/napoleon_analysis.sql | 46 | ||||
| -rw-r--r-- | rushs/data-clash/step-3/napoleon_connections.sql | 29 | ||||
| -rw-r--r-- | rushs/data-clash/step-3/napoleon_filter.sql | 30 | ||||
| -rw-r--r-- | rushs/data-clash/step-3/napoleon_synthesis.sql | 43 | ||||
| -rw-r--r-- | rushs/data-clash/step-3/napoleon_timeline.sql | 85 |
5 files changed, 233 insertions, 0 deletions
diff --git a/rushs/data-clash/step-3/napoleon_analysis.sql b/rushs/data-clash/step-3/napoleon_analysis.sql new file mode 100644 index 0000000..cdbb4bd --- /dev/null +++ b/rushs/data-clash/step-3/napoleon_analysis.sql @@ -0,0 +1,46 @@ +SELECT + id, + right(location, 7) AS location, + end_time - start_time AS duration, + left(details, 10) AS details, + CASE + WHEN + NOT regexp_like(details, '^([[:ascii:]])*$') + THEN 'invalid_characters' + WHEN + start_time > end_time OR location NOT LIKE '%MMT' + THEN 'improbable_values' + WHEN start_time IS NULL OR end_time IS NULL THEN 'missing_timestamp' + WHEN + location IS NULL OR latitude IS NULL OR longitude IS NULL + THEN 'missing_location' + WHEN + EXISTS ( + SELECT * + FROM napoleon_data.public_reports AS t + WHERE + r.id > t.id + AND r.location = t.location + AND r.start_time = t.start_time + AND r.end_time = t.end_time + AND r.details = t.details + ) + THEN 'duplicate_report' + END AS issue +FROM napoleon_data.public_reports AS r +WHERE + NOT regexp_like(details, '^([[:ascii:]])*$') + OR start_time IS NULL OR end_time IS NULL + OR start_time > end_time + OR location NOT LIKE '%MMT' + OR location IS NULL OR latitude IS NULL OR longitude IS NULL + OR EXISTS ( + SELECT * + FROM napoleon_data.public_reports AS t + WHERE + r.id > t.id + AND r.location = t.location + AND r.start_time = t.start_time + AND r.end_time = t.end_time + AND r.details = t.details + ) diff --git a/rushs/data-clash/step-3/napoleon_connections.sql b/rushs/data-clash/step-3/napoleon_connections.sql new file mode 100644 index 0000000..170f635 --- /dev/null +++ b/rushs/data-clash/step-3/napoleon_connections.sql @@ -0,0 +1,29 @@ +SELECT + t.id AS individual_id, + t.name, + t.role, + t.affiliation AS current_affiliation, + p.affiliation AS historical_affiliation +FROM surveillance_data.current_targets AS t +INNER JOIN napoleon_data.historical_people AS p + ON + t.name = p.name + OR t.affiliation != 'N/A' +WHERE + EXISTS ( + SELECT 1 FROM (SELECT + a1.group_name_1, + a2.group_name_2 + FROM surveillance_data.group_affinities AS a1, + surveillance_data.group_affinities AS a2 + WHERE a1.group_name_2 = a2.group_name_1 + UNION + SELECT + group_name_1, + group_name_2 + FROM surveillance_data.group_affinities) AS test + WHERE + (group_name_1 = p.affiliation AND t.affiliation = group_name_2) + OR (group_name_1 = t.affiliation AND group_name_2 = p.affiliation) + ) + OR t.affiliation = p.affiliation diff --git a/rushs/data-clash/step-3/napoleon_filter.sql b/rushs/data-clash/step-3/napoleon_filter.sql new file mode 100644 index 0000000..797f085 --- /dev/null +++ b/rushs/data-clash/step-3/napoleon_filter.sql @@ -0,0 +1,30 @@ +WITH proximity_filtered AS ( + SELECT DISTINCT ON (r.id) + r.id, + r.location, + r.details, + r.start_time, + MIN(6371 * ACOS( + COS(RADIANS(r.latitude)) * COS(RADIANS(l.latitude)) + * COS(RADIANS(l.longitude) - RADIANS(r.longitude)) + + SIN(RADIANS(r.latitude)) * SIN(RADIANS(l.latitude)) + )) AS distance + FROM napoleon_data.public_reports AS r, + napoleon_data.critical_locations AS l + WHERE + 300 >= (6371 * ACOS( + COS(RADIANS(r.latitude)) * COS(RADIANS(l.latitude)) + * COS(RADIANS(l.longitude) - RADIANS(r.longitude)) + + SIN(RADIANS(r.latitude)) * SIN(RADIANS(l.latitude)) + )) + AND details ILIKE '%Napoleon%' + AND start_time BETWEEN '2069-01-01' AND '2070-12-31' + GROUP BY r.id, r.location, r.details, r.start_time +) + +SELECT + id, + location, + distance +FROM proximity_filtered +ORDER BY distance, id diff --git a/rushs/data-clash/step-3/napoleon_synthesis.sql b/rushs/data-clash/step-3/napoleon_synthesis.sql new file mode 100644 index 0000000..f71dd09 --- /dev/null +++ b/rushs/data-clash/step-3/napoleon_synthesis.sql @@ -0,0 +1,43 @@ +CREATE VIEW napoleon_affair_view AS +WITH count_report AS ( + SELECT + location, + COUNT(*) AS total_reports + FROM napoleon_data.public_reports + GROUP BY location +), +persons_mention AS ( + SELECT + location, + name AS person, + count(*) AS occ + FROM napoleon_data.public_reports + INNER JOIN napoleon_data.historical_people + ON details LIKE concat('%', name, '%') + GROUP BY location, name +), +most_mentioned AS ( + SELECT + location, + person AS most_mentioned_person, + occ AS occurrences_most_mentioned_person, + row_number() OVER (PARTITION BY location ORDER BY occ DESC) AS rn + FROM persons_mention +), +unique_people_count AS ( + SELECT + location, + COUNT(DISTINCT person) AS total_people_mentioned + FROM persons_mention + GROUP BY location +) + +SELECT + rc.location, + total_reports, + total_people_mentioned, + most_mentioned_person, + occurrences_most_mentioned_person +FROM count_report AS rc +INNER JOIN unique_people_count AS uc ON rc.location = uc.location +INNER JOIN most_mentioned AS mmmm ON mmmm.location = rc.location AND mmmm.rn = 1; diff --git a/rushs/data-clash/step-3/napoleon_timeline.sql b/rushs/data-clash/step-3/napoleon_timeline.sql new file mode 100644 index 0000000..eff8be1 --- /dev/null +++ b/rushs/data-clash/step-3/napoleon_timeline.sql @@ -0,0 +1,85 @@ +WITH rapports_valides AS ( + SELECT + id, + location, + start_time, + end_time, + latitude, + longitude + FROM napoleon_data.public_reports + WHERE location IS NOT NULL + AND start_time IS NOT NULL + AND end_time IS NOT NULL + AND latitude IS NOT NULL + AND longitude IS NOT NULL +), +rapports_ordonnes AS ( + SELECT + id, + location, + start_time, + end_time, + latitude, + longitude, + LAG(id) OVER (ORDER BY start_time) AS pid, + LAG(location) OVER (ORDER BY start_time) AS ploc, + LAG(start_time) OVER (ORDER BY start_time) AS pstart, + LAG(end_time) OVER (ORDER BY start_time) AS pend, + LAG(latitude) OVER (ORDER BY start_time) AS plat, + LAG(longitude) OVER (ORDER BY start_time) AS plong + FROM rapports_valides +), +diffrapports AS ( + SELECT + id, + location, + start_time, + end_time, + latitude, + longitude, + pid, + ploc, + pstart, + pend, + plat, + plong, + extract(EPOCH from (start_time - pend)) AS tdiff, + (6371 * ACOS( + COS(RADIANS(latitude)) * COS(RADIANS(plat)) + * COS(RADIANS(plong) - RADIANS(longitude)) + + SIN(RADIANS(latitude)) * SIN(RADIANS(plat)) + )) AS d + FROM rapports_ordonnes +), +analyse_rapports AS ( + SELECT + id, + location, + start_time, + end_time, + latitude, + longitude, + pid, + ploc, + pstart, + pend, + plat, + plong, + tdiff, + d, + CASE + WHEN start_time < pend THEN 'overlapping_timeframes' + WHEN tdiff < 7200 AND d > 500 THEN 'moving_fast' + WHEN extract(EPOCH from (end_time - start_time)) > 86400 THEN 'long_duration' + WHEN d IS NULL OR tdiff IS NULL THEN 'missing_data' + WHEN d > 500 THEN 'long_distance' + ELSE 'clear' + END AS issue + FROM diffrapports +) +SELECT + tdiff AS time_difference, + d AS distance, + issue +FROM analyse_rapports +ORDER BY start_time |
