Class: Buhos::DuplicateAnalysis
- Inherits:
-
Object
- Object
- Buhos::DuplicateAnalysis
- Defined in:
- lib/buhos/duplicate_analysis.rb
Instance Attribute Summary collapse
-
#canonical_documents ⇒ Object
readonly
Returns the value of attribute canonical_documents.
Instance Method Summary collapse
-
#by_doi ⇒ Object
Returns a list of repeated DOIs.
-
#by_metadata ⇒ Object
Finds duplicate pairs by metadata, using the publication year as the blocking key.
- #by_pubmed_id ⇒ Object
- #by_scielo_id ⇒ Object
- #by_scopus_id ⇒ Object
- #by_wos_id ⇒ Object
- #cd_dois_arent_different(cd1, cd2) ⇒ Object
- #cd_is_identical(cd1, cd2) ⇒ Object
- #cd_is_very_similar(cd1, cd2) ⇒ Object
-
#initialize(cds) ⇒ DuplicateAnalysis
constructor
A new instance of DuplicateAnalysis.
Constructor Details
#initialize(cds) ⇒ DuplicateAnalysis
Returns a new instance of DuplicateAnalysis.
35 36 37 38 39 40 41 42 43 |
# File 'lib/buhos/duplicate_analysis.rb', line 35
#
# Builds an analyzer over the given canonical documents.
#
# A Levenshtein implementation is required here: 'levenshtein-ffi' is tried
# first, and 'levenshtein' is used as the fallback when the former cannot be
# loaded.
#
# @param cds [Object] the canonical documents to analyze; stored unchanged
#   in @canonical_documents
# @raise [LoadError] when neither Levenshtein library can be required
def initialize(cds)
  begin
    require 'levenshtein-ffi'
  rescue LoadError
    require 'levenshtein'
  end
  @canonical_documents = cds
end
Instance Attribute Details
#canonical_documents ⇒ Object (readonly)
Returns the value of attribute canonical_documents.
34 35 36 |
# File 'lib/buhos/duplicate_analysis.rb', line 34
#
# Read-only accessor.
#
# @return [Object] the value currently held in @canonical_documents
def canonical_documents
  @canonical_documents
end
Instance Method Details
#by_doi ⇒ Object
Returns a list of repeated DOIs
45 46 47 |
# File 'lib/buhos/duplicate_analysis.rb', line 45
#
# Lists the DOIs that occur on more than one canonical document.
#
# Rows whose doi is nil are excluded, the remainder is grouped and counted
# by doi, and only groups with a count above 1 are kept.
# NOTE(review): the query API looks like a Sequel dataset - confirm.
#
# @return [Array] the repeated :doi values
def by_doi
  canonical_documents
    .exclude(doi: nil)
    .group_and_count(:doi)
    .having { count.function.* > 1 }
    .all
    .map { |row| row[:doi] }
end
#by_metadata ⇒ Object
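Finds duplicate pairs by metadata, using the publication year as the blocking key: only documents that share the same year are compared with each other. See www.sciencedirect.com/science/article/pii/S1319157817304512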
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/buhos/duplicate_analysis.rb', line 84
#
# Finds pairs of canonical documents that look like duplicates according to
# their metadata.
#
# Documents are first grouped by :year (blocking), so only documents that
# share a year are compared. Within each group every unordered pair is
# tested once; a pair is reported when it is identical or very similar and
# its DOIs are not in conflict.
#
# @return [Array<Array>] pairs of document ids; each pair is sorted and the
#   list of pairs is sorted as well
def by_metadata
  dups = []
  canonical_documents.to_hash_groups(:year).each_value do |same_year|
    # combination(2) visits each (i, j) with i < j exactly once, in the same
    # order as a nested index loop would.
    same_year.combination(2) do |cd1, cd2|
      next unless cd_is_identical(cd1, cd2) || cd_is_very_similar(cd1, cd2)
      next unless cd_dois_arent_different(cd1, cd2)

      dups.push([cd1[:id], cd2[:id]].sort)
    end
  end
  # Compare whole pairs so that ties on the first id are ordered
  # deterministically by the second id.
  dups.sort
end
#by_pubmed_id ⇒ Object
57 58 59 |
# File 'lib/buhos/duplicate_analysis.rb', line 57
#
# Lists the PubMed ids that occur on more than one canonical document.
#
# Rows whose pubmed_id is nil are excluded, the remainder is grouped and
# counted by pubmed_id, and only groups with a count above 1 are kept.
# NOTE(review): the query API looks like a Sequel dataset - confirm.
#
# @return [Array] the repeated :pubmed_id values
def by_pubmed_id
  canonical_documents
    .exclude(pubmed_id: nil)
    .group_and_count(:pubmed_id)
    .having { count.function.* > 1 }
    .all
    .map { |row| row[:pubmed_id] }
end
#by_scielo_id ⇒ Object
54 55 56 |
# File 'lib/buhos/duplicate_analysis.rb', line 54
#
# Lists the SciELO ids that occur on more than one canonical document.
#
# Rows whose scielo_id is nil are excluded, the remainder is grouped and
# counted by scielo_id, and only groups with a count above 1 are kept.
# NOTE(review): the query API looks like a Sequel dataset - confirm.
#
# @return [Array] the repeated :scielo_id values
def by_scielo_id
  canonical_documents
    .exclude(scielo_id: nil)
    .group_and_count(:scielo_id)
    .having { count.function.* > 1 }
    .all
    .map { |row| row[:scielo_id] }
end
#by_scopus_id ⇒ Object
48 49 50 |
# File 'lib/buhos/duplicate_analysis.rb', line 48
#
# Lists the Scopus ids that occur on more than one canonical document.
#
# Rows whose scopus_id is nil are excluded, the remainder is grouped and
# counted by scopus_id, and only groups with a count above 1 are kept.
# NOTE(review): the query API looks like a Sequel dataset - confirm.
#
# @return [Array] the repeated :scopus_id values
def by_scopus_id
  canonical_documents
    .exclude(scopus_id: nil)
    .group_and_count(:scopus_id)
    .having { count.function.* > 1 }
    .all
    .map { |row| row[:scopus_id] }
end
#by_wos_id ⇒ Object
51 52 53 |
# File 'lib/buhos/duplicate_analysis.rb', line 51
#
# Lists the Web of Science ids that occur on more than one canonical
# document.
#
# Rows whose wos_id is nil are excluded, the remainder is grouped and
# counted by wos_id, and only groups with a count above 1 are kept.
# NOTE(review): the query API looks like a Sequel dataset - confirm.
#
# @return [Array] the repeated :wos_id values
def by_wos_id
  canonical_documents
    .exclude(wos_id: nil)
    .group_and_count(:wos_id)
    .having { count.function.* > 1 }
    .all
    .map { |row| row[:wos_id] }
end
#cd_dois_arent_different(cd1, cd2) ⇒ Object
65 66 67 |
# File 'lib/buhos/duplicate_analysis.rb', line 65
#
# Tells whether the DOIs of two canonical documents are compatible.
#
# The DOIs only count as "different" when both documents have one and the
# two values differ; a missing DOI on either side never rules a pair out.
#
# @param cd1 [#[]] first document, read through cd1[:doi]
# @param cd2 [#[]] second document, read through cd2[:doi]
# @return [Boolean] false only when both DOIs are present and unequal
def cd_dois_arent_different(cd1, cd2)
  doi1 = cd1[:doi]
  doi2 = cd2[:doi]
  doi1.nil? || doi2.nil? || doi1 == doi2
end
#cd_is_identical(cd1, cd2) ⇒ Object
61 62 63 |
# File 'lib/buhos/duplicate_analysis.rb', line 61
#
# Exact-match comparison of two canonical documents.
#
# Title and year must always be equal. Journal and pages must also be equal,
# unless cd1 has no journal, in which case they are not checked at all.
# NOTE(review): the relaxation looks only at cd1's journal, so the result
# can change when the arguments are swapped - confirm this is intended.
#
# @param cd1 [#[]] first document (:title, :year, :journal, :pages)
# @param cd2 [#[]] second document (same keys)
# @return [Boolean] true when the documents match under the rules above
def cd_is_identical(cd1, cd2)
  return false unless cd1[:title] == cd2[:title]
  return false unless cd1[:year] == cd2[:year]

  cd1[:journal].nil? ||
    (cd1[:journal] == cd2[:journal] && cd1[:pages] == cd2[:pages])
end
#cd_is_very_similar(cd1, cd2) ⇒ Object
68 69 70 71 72 73 74 75 76 77 78 79 |
# File 'lib/buhos/duplicate_analysis.rb', line 68
#
# Fuzzy comparison of two canonical documents.
#
# Each document is reduced to a fingerprint "year title authors journal
# pages" from which every character other than ASCII letters, digits and
# whitespace is removed, then lowercased. The documents are very similar
# when the Levenshtein distance between the fingerprints is below 5.
#
# @param cd1 [#[]] first document (:year, :title, :authors, :journal, :pages)
# @param cd2 [#[]] second document (same keys)
# @return [Boolean] false whenever cd1's fingerprint has 10 characters or
#   fewer; otherwise whether the edit distance is less than 5
def cd_is_very_similar(cd1, cd2)
  fingerprint = lambda do |cd|
    "#{cd[:year]} #{cd[:title]} #{cd[:authors]} #{cd[:journal]} #{cd[:pages]}"
      .gsub(/[^A-Za-z\d\s]/, "")
      .downcase
  end

  t1 = fingerprint.call(cd1)
  t2 = fingerprint.call(cd2)
  # With a very short fingerprint a distance under 5 carries no information,
  # so such documents are never reported as similar.
  return false unless t1.length > 10

  Levenshtein.distance(t1, t2) < 5
end