我们在oracle pl/sql中有以下代码,它们使用Jaro-Winkler计算两个字符串之间的相似度。我们尝试做的是,根据两个字符串的相似度来找到任何dup。我们的用例是用户可以输入用户信息,例如first_name,last_name,但有打字错误,我们唯一可以使用的密钥是first_name,last_name。没有像SSN或电子邮件这样的唯一标识符来标识用户。因此,我们的想法是在first_name和last_name上进行自连接,然后获得相似度得分,并在此基础上识别dup。
然而,即使有10,000个用户,也会有100,000,000个操作来进行匹配,我们在oracle db中尝试这样做,但它太慢了。
我们是google-bigquery或Amazon Red-shift的新手。有没有关于如何在我们的数据集中实现自定义函数的教程。
或者google-bigquery或Amazon Red Shift已经有了类似oracle的解决方案??
我们目前的环境不适合做这个概念验证,所以我们喜欢在云中做这个练习。
谢谢你的帮助。
--http://www.orafaq.com/forum/t/164224/
CREATE OR REPLACE FUNCTION GKN_COMMON.jws -- Jaro-Winkler similarity
(p_string1 IN VARCHAR2,
p_string2 IN VARCHAR2)
RETURN NUMBER
DETERMINISTIC
AS
v_string1 VARCHAR2 (32767);
v_string2 VARCHAR2 (32767);
v_closeness NUMBER := 0;
v_temp VARCHAR2 (32767);
v_comp1 VARCHAR2 (32767);
v_comp2 VARCHAR2 (32767);
v_matches NUMBER := 0;
v_char VARCHAR2 (1);
v_transpositions NUMBER := 0;
v_d_jaro NUMBER := 0;
v_leading NUMBER := 0;
v_d_winkler NUMBER := 0;
v_jws NUMBER := 0;
BEGIN
-- check for null strings:
IF p_string1 IS NULL OR p_string2 IS NULL THEN
RETURN 0;
END IF;
-- remove accents:
v_string1 := translate (p_string1,
'?S?Zs?z?AAA?A??CEEEEIIII??OOO?O?UUUUY?aaa?a??ceeeeiiii???ooo?ouuuuyy??',
'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy');
v_string2 := translate (p_string2,
'?S?Zs?z?AAA?A??CEEEEIIII??OOO?O?UUUUY?aaa?a??ceeeeiiii???ooo?ouuuuyy??',
'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy');
-- closeness:
v_closeness := (GREATEST (LENGTH (v_string1), LENGTH (v_string2)) / 2) - 1;
-- find matching characters and transpositions within closeness:
v_temp := v_string2;
FOR i IN 1 .. LENGTH (v_string1) LOOP
IF INSTR (v_temp, SUBSTR (v_string1, i, 1)) > 0 THEN
v_char := SUBSTR (v_string1, i, 1);
IF ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness THEN
v_comp1 := v_comp1 || SUBSTR (v_string1, i, 1);
v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (v_string1, i, 1)) - 1)
|| SUBSTR (v_temp, INSTR (v_temp, SUBSTR (v_string1, i, 1)) + 1);
END IF;
END IF;
END LOOP;
v_temp := v_string1;
FOR i IN 1 .. LENGTH (v_string2) LOOP
IF INSTR (v_temp, SUBSTR (v_string2, i, 1)) > 0 THEN
v_char := SUBSTR (v_string2, i, 1);
IF ABS (INSTR (v_string2, v_char) - INSTR (v_string1, v_char)) <= v_closeness THEN
v_comp2 := v_comp2 || SUBSTR (v_string2, i, 1);
v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (v_string2, i, 1)) - 1)
|| SUBSTR (v_temp, INSTR (v_temp, SUBSTR (v_string2, i, 1)) + 1);
END IF;
END IF;
END LOOP;
-- check for null strings:
IF v_comp1 IS NULL OR v_comp2 IS NULL THEN
RETURN 0;
END IF;
-- count matches and transpositions within closeness:
FOR i IN 1 .. LEAST (LENGTH (v_comp1), LENGTH (v_comp2)) LOOP
IF SUBSTR (v_comp1, i, 1) = SUBSTR (v_comp2, i, 1) THEN
v_matches := v_matches + 1;
ELSE
v_char := SUBSTR (v_comp1, i, 1);
IF ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness THEN
v_transpositions := v_transpositions + 1;
v_matches := v_matches + 1;
END IF;
END IF;
END LOOP;
v_transpositions := v_transpositions / 2;
-- check for no matches:
IF v_matches = 0
THEN RETURN 0;
END IF;
-- Jaro:
v_d_jaro := ((v_matches / LENGTH (v_string1)) +
(v_matches / LENGTH (v_string2)) +
((v_matches - v_transpositions) / v_matches))
/ 3;
-- count matching leading characters (up to 4):
FOR i IN 1 .. LEAST (LENGTH (v_string1), LENGTH (v_string2), 4) LOOP
IF SUBSTR (v_string1, i, 1) = SUBSTR (v_string2, i, 1) THEN
v_leading := v_leading + 1;
ELSE
EXIT;
END IF;
END LOOP;
-- Winkler:
v_d_winkler := v_d_jaro + ((v_leading * .1) * (1 - v_d_jaro));
-- Jaro-Winkler similarity rounded:
v_jws := ROUND (v_d_winkler * 100);
RETURN v_jws;
END jws;
WITH
strings AS
(SELECT NULL string1, NULL string2 FROM DUAL UNION ALL
SELECT 'test' string1, NULL string2 FROM DUAL UNION ALL
SELECT NULL string1, 'test' string2 FROM DUAL UNION ALL
SELECT 'CRATE' string1, 'TRACE' string2 FROM DUAL UNION ALL
SELECT 'MARTHA' string1, 'MARHTA' string2 FROM DUAL UNION ALL
SELECT 'DWAYNE' string1, 'DUANE' string2 FROM DUAL UNION ALL
SELECT 'DIXON' string1, 'DICKSONX' string2 FROM DUAL UNION ALL
SELECT 'Dunningham' string1, 'Cunningham' string2 FROM DUAL UNION ALL
SELECT 'Abroms' string1, 'Abrams' string2 FROM DUAL UNION ALL
SELECT 'Lampley' string1, 'Campley' string2 FROM DUAL UNION ALL
SELECT 'Jonathon' string1, 'Jonathan' string2 FROM DUAL UNION ALL
SELECT 'Jeraldine' string1, 'Gerladine' string2 FROM DUAL UNION ALL
SELECT 'test' string1, 'blank' string2 FROM DUAL UNION ALL
SELECT 'everybody' string1, 'every' string2 FROM DUAL UNION ALL
SELECT 'a' string1, 'aaa' string2 FROM DUAL UNION ALL
SELECT 'Géraldine' string1, 'Gerladine' string2 FROM DUAL UNION ALL
SELECT 'Jérôme' string1, 'Jerome' string2 FROM DUAL UNION ALL
SELECT 'ça' string1, 'ca' string2 FROM DUAL UNION ALL
SELECT 'Üwe' string1, 'Uwe' string2 FROM DUAL)
SELECT string1, string2,
--UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
jws (string1, string2) my_jws
FROM strings
ORDER BY my_jws DESC发布于 2016-10-04 08:36:39
检查下面的示例
它用于具有标准SQL的BigQuery (请检查Enabling Standard SQL),并使用JS User-Defined Functions
CREATE TEMPORARY FUNCTION similariry(Name1 STRING, Name2 STRING)
RETURNS FLOAT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_Name1;
try {
the_Name1 = decodeURI(Name1).toLowerCase();
} catch (ex) {
the_Name1 = Name1.toLowerCase();
}
try {
the_Name2 = decodeURI(Name2).toLowerCase();
} catch (ex) {
the_Name2 = Name2.toLowerCase();
}
return 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length;
""";
WITH strings AS (
SELECT NULL string1, NULL string2 UNION ALL
SELECT 'test' string1, NULL string2 UNION ALL
SELECT NULL string1, 'test' string2 UNION ALL
SELECT 'CRATE' string1, 'TRACE' string2 UNION ALL
SELECT 'MARTHA' string1, 'MARHTA' string2 UNION ALL
SELECT 'DWAYNE' string1, 'DUANE' string2 UNION ALL
SELECT 'DIXON' string1, 'DICKSONX' string2 UNION ALL
SELECT 'Dunningham' string1, 'Cunningham' string2 UNION ALL
SELECT 'Abroms' string1, 'Abrams' string2 UNION ALL
SELECT 'Lampley' string1, 'Campley' string2 UNION ALL
SELECT 'Jonathon' string1, 'Jonathan' string2 UNION ALL
SELECT 'Jeraldine' string1, 'Gerladine' string2 UNION ALL
SELECT 'test' string1, 'blank' string2 UNION ALL
SELECT 'everybody' string1, 'every' string2 UNION ALL
SELECT 'a' string1, 'aaa' string2 UNION ALL
SELECT 'Géraldine' string1, 'Gerladine' string2 UNION ALL
SELECT 'Jérôme' string1, 'Jerome' string2 UNION ALL
SELECT 'ça' string1, 'ca' string2 UNION ALL
SELECT 'Üwe' string1, 'Uwe' string2
)
SELECT string1, string2, similariry(string1, string2) my_sim
FROM strings
ORDER BY my_sim DESChttps://stackoverflow.com/questions/39841678
复制相似问题