إزالة التكرارات مع التحذيرات
-
02-07-2019 - |
سؤال
لدي جدول يحتوي على معرف الصف وخط الطول وخط العرض واسم النشاط التجاري وعنوان URL والتسمية التوضيحية. قد يبدو هذا كالتالي:
rowID | long | lat | businessName | url | caption
1 | 20 | -20 | Pizza Hut | yum.com | null
كيف يمكنني حذف جميع التكرارات، مع الاحتفاظ فقط بالنسخة التي تحتوي على عنوان URL (الأولوية الأولى)، أو الاحتفاظ بالنسخة التي تحتوي على تسمية توضيحية إذا لم يكن لدى الأخرى عنوان URL (الأولوية الثانية) وحذف الباقي؟
الحل
هذه هي تقنيتي القائمة على الحلقات التكرارية (looping). من المحتمل أن يُصوَّت ضد هذه الإجابة لأنها ليست الأسلوب السائد - وأنا مرتاح لذلك.
-- Row-by-row de-duplication of Locations.
-- Visits the surviving rows in ascending rowID order. For each visited row it
-- picks one row to keep among all rows sharing the same
-- (long, lat, businessname) and deletes the others.
-- Keep preference: non-null URL first, then non-null Caption, then lowest RowId.
DECLARE
    @currentId int,
    @keyLong   int,
    @keyLat    int,
    @keyName   varchar(30),
    @keeperId  int

SELECT @currentId = MIN(rowID)
FROM Locations

WHILE @currentId IS NOT NULL
BEGIN
    -- Clear per-iteration state so nothing carries over from the previous pass.
    SET @keyLong  = NULL
    SET @keyLat   = NULL
    SET @keyName  = NULL
    SET @keeperId = NULL

    -- Read the duplicate-group key from the row being visited.
    SELECT
        @keyLong = loc.long,
        @keyLat  = loc.lat,
        @keyName = loc.businessname
    FROM Locations AS loc
    WHERE loc.rowID = @currentId

    -- Pick the row to keep: the ORDER BY puts the preferred row first.
    SELECT TOP 1 @keeperId = loc.rowID
    FROM Locations AS loc
    WHERE loc.long = @keyLong
      AND loc.lat = @keyLat
      AND loc.businessname = @keyName
    ORDER BY
        CASE WHEN loc.URL IS NULL THEN 2 ELSE 1 END,
        CASE WHEN loc.Caption IS NULL THEN 2 ELSE 1 END,
        loc.RowId

    -- Remove every other row of the same group.
    DELETE FROM Locations
    WHERE long = @keyLong
      AND lat = @keyLat
      AND businessname = @keyName
      AND rowID <> @keeperId

    -- Move on to the next surviving row; MIN over no rows yields NULL,
    -- which ends the loop.
    SELECT @currentId = MIN(rowID)
    FROM Locations
    WHERE rowID > @currentId
END
نصائح أخرى
تم تقديم هذا الحل لك من خلال "الأشياء التي تعلمتها في Stack Overflow" في الأسبوع الماضي:
-- Delete every restaurant row that is not the single best row of its
-- (BusinessName, lat, long) group.
-- Preference: rows with a url, then rows with a caption, then lowest rowID.
-- The CASE keys state the NULL priority explicitly instead of relying on how
-- NULLs sort. The trailing rowID key breaks ties: RANK() gives the same rank
-- to tied rows, so without it rows with identical url/caption would all get
-- rank 1 and none of them would be deleted.
-- NOTE(review): assumes rowID is unique within the table - confirm.
DELETE restaurant
WHERE rowID IN
    (SELECT rowID
     FROM restaurant
     EXCEPT
     SELECT rowID
     FROM (
         SELECT rowID,
                RANK() OVER (
                    PARTITION BY BusinessName, lat, long
                    ORDER BY
                        CASE WHEN url IS NULL THEN 1 ELSE 0 END,
                        CASE WHEN caption IS NULL THEN 1 ELSE 0 END,
                        rowID
                ) AS RowRank
         FROM restaurant
     ) rs
     WHERE RowRank = 1)
تحذير: لم أختبر هذا على قاعدة بيانات حقيقية
الحل القائم على المجموعات (set-based):
/* Delete a row when another row with the same long, lat and businessName
   scores strictly higher. Scores: 4 for a non-null url, 2 for a non-null
   caption, 1 for having the smaller rowID of the pair. The rowID point is
   applied to both sides so that, of two otherwise equal rows, the one with
   the larger rowID loses and exactly one row per group survives. */
delete from T as t1
where /* delete if there is a "better" row
with same long, lat and businessName */
exists(
    select * from T as t2 where
    t1.rowID <> t2.rowID
    and t1.long = t2.long
    and t1.lat = t2.lat
    and t1.businessName = t2.businessName
    and
      case when t1.url is null then 0 else 4 end
      /* 4 points for non-null url */
    + case when t1.caption is null then 0 else 2 end
      /* 2 points for non-null caption */
    + case when t1.rowID > t2.rowID then 0 else 1 end
      /* 1 point for having smaller rowID */
    <
      case when t2.url is null then 0 else 4 end
    + case when t2.caption is null then 0 else 2 end
    + case when t2.rowID > t1.rowID then 0 else 1 end
      /* same 1-point rule for t2, so equal-scored pairs are still resolved */
)
-- Delete every MyTable row except one keeper per (long, lat, businessName).
-- Each derived table yields at most one candidate rowID per group:
--   first_with_url     - lowest rowID among rows that have a url
--   first_with_caption - lowest rowID among rows that have a caption
--   first_with_neither - lowest rowID among rows that have neither
-- COALESCE takes the first candidate that exists, in that priority order,
-- and all rows with a different rowID are removed.
DELETE loc
FROM MyTable AS loc
LEFT JOIN (
    SELECT
        MIN(rowID) AS rowID,
        long,
        lat,
        businessName
    FROM MyTable
    WHERE url IS NOT NULL
    GROUP BY long, lat, businessName
) AS first_with_url
    ON  first_with_url.long = loc.long
    AND first_with_url.lat = loc.lat
    AND first_with_url.businessName = loc.businessName
LEFT JOIN (
    SELECT
        MIN(rowID) AS rowID,
        long,
        lat,
        businessName
    FROM MyTable
    WHERE caption IS NOT NULL
    GROUP BY long, lat, businessName
) AS first_with_caption
    ON  first_with_caption.long = loc.long
    AND first_with_caption.lat = loc.lat
    AND first_with_caption.businessName = loc.businessName
LEFT JOIN (
    SELECT
        MIN(rowID) AS rowID,
        long,
        lat,
        businessName
    FROM MyTable
    WHERE url IS NULL
      AND caption IS NULL
    GROUP BY long, lat, businessName
) AS first_with_neither
    ON  first_with_neither.long = loc.long
    AND first_with_neither.lat = loc.lat
    AND first_with_neither.businessName = loc.businessName
WHERE loc.rowID <> COALESCE(
          first_with_url.rowID,
          first_with_caption.rowID,
          first_with_neither.rowID
      )
تشبه إجابة أخرى، لكنك تريد الحذف بناءً على رقم الصف (ROW_NUMBER) بدلاً من الرتبة (RANK). امزج ذلك مع تعبيرات الجدول الشائعة (CTE) أيضًا:
-- Number the rows inside each (BusinessName, lat, long) group, then delete
-- every row that is not numbered 1.
-- The ordering is url DESC, then caption DESC.
-- NOTE(review): this relies on NULLs sorting after non-null values under
-- DESC, which is SQL Server's behavior - confirm for the target engine.
;WITH NumberedRows AS
(
    SELECT
        rowID,
        ROW_NUMBER() OVER (
            PARTITION BY BusinessName, lat, long
            ORDER BY url DESC, caption DESC
        ) AS position_in_group
    FROM restaurant
)
DELETE target
FROM restaurant AS target
INNER JOIN NumberedRows AS numbered
    ON numbered.rowID = target.rowID
WHERE numbered.position_in_group > 1
إذا كان ذلك ممكنًا، هل يمكنك توحيد القيم أولاً ثم إزالة التكرارات؟
الخطوة 1:
-- Copy a single url onto every row of each (long, lat, businessName) group
-- that has at least one non-null url.
-- The source is aggregated to one row per group so each target row matches
-- exactly one value. Joining to the raw rows would let a target row match
-- several different urls, and which one gets written is then undefined, so
-- the group might not end up homogeneous.
-- MAX(url) is an arbitrary but deterministic choice among the group's urls.
UPDATE BusinessLocations
SET BusinessLocations.url = LocationsWithUrl.url
FROM BusinessLocations
INNER JOIN (
    SELECT long, lat, businessName, MAX(url) AS url
    FROM BusinessLocations
    WHERE url IS NOT NULL
    GROUP BY long, lat, businessName) LocationsWithUrl
ON BusinessLocations.long = LocationsWithUrl.long
AND BusinessLocations.lat = LocationsWithUrl.lat
AND BusinessLocations.businessName = LocationsWithUrl.businessName
-- Copy a single caption onto every row of each (long, lat, businessName)
-- group that has at least one non-null caption.
-- The source is aggregated to one row per group so each target row matches
-- exactly one value. Joining to the raw rows would let a target row match
-- several different captions, and which one gets written is then undefined.
-- MAX(caption) is an arbitrary but deterministic choice among the group's
-- captions.
UPDATE BusinessLocations
SET BusinessLocations.caption = LocationsWithCaption.caption
FROM BusinessLocations
INNER JOIN (
    SELECT long, lat, businessName, MAX(caption) AS caption
    FROM BusinessLocations
    WHERE caption IS NOT NULL
    GROUP BY long, lat, businessName) LocationsWithCaption
ON BusinessLocations.long = LocationsWithCaption.long
AND BusinessLocations.lat = LocationsWithCaption.lat
AND BusinessLocations.businessName = LocationsWithCaption.businessName
الخطوة 2: إزالة التكرارات.