Class: Buhos::DuplicateAnalysis

Inherits:
Object
  • Object
show all
Defined in:
lib/buhos/duplicate_analysis.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(cds) ⇒ DuplicateAnalysis

Returns a new instance of DuplicateAnalysis.



35
36
37
38
39
40
41
42
43
# File 'lib/buhos/duplicate_analysis.rb', line 35

# Builds an analyzer over the given canonical documents.
#
# @param cds the collection of canonical documents to analyze; it is stored
#   as-is. NOTE(review): the query methods of this class call Sequel-style
#   dataset methods on it (exclude, group_and_count, to_hash_groups), so it
#   is presumably a Sequel dataset -- confirm against callers.
def initialize(cds)
  # Load a Levenshtein implementation lazily: try 'levenshtein-ffi' first and
  # fall back to 'levenshtein' when the former cannot be loaded.
  begin
    require 'levenshtein-ffi'
  rescue LoadError
    require 'levenshtein'
  end

  @canonical_documents=cds
end

Instance Attribute Details

#canonical_documents ⇒ Object (readonly)

Returns the value of attribute canonical_documents.



34
35
36
# File 'lib/buhos/duplicate_analysis.rb', line 34

# Reader for the canonical documents collection given to the constructor.
#
# @return [Object] the value stored in @canonical_documents
def canonical_documents
  @canonical_documents
end

Instance Method Details

#by_doi ⇒ Object

Returns the list of DOI values that appear on more than one canonical document.



45
46
47
# File 'lib/buhos/duplicate_analysis.rb', line 45

# Lists the DOI values that occur on more than one canonical document.
# Documents without a DOI are left out before counting.
#
# @return [Array] one entry per repeated DOI
def by_doi
  with_doi = canonical_documents.exclude(doi: nil)
  repeated = with_doi.group_and_count(:doi).having { count.function.* > 1 }
  repeated.all.map { |row| row[:doi] }
end

#by_metadata ⇒ Object

Uses a blocking method based on year: only documents that share the same year are compared with each other. See https://www.sciencedirect.com/science/article/pii/S1319157817304512

Returns:

  • array with pairs of duplicates



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/buhos/duplicate_analysis.rb', line 84

# Finds probable duplicates by comparing document metadata.
#
# Documents are grouped ("blocked") by year, and only documents within the
# same year group are compared pairwise. A pair is reported when the two
# documents are identical or very similar AND their DOIs do not contradict
# each other.
#
# @return [Array<Array>] pairs of document ids; each pair is sorted, and the
#   list of pairs is sorted lexicographically so the output is deterministic
def by_metadata
  dups = []
  canonical_documents.to_hash_groups(:year).each_value do |docs|
    # Every unordered pair inside one year group is examined exactly once.
    docs.combination(2) do |cd1, cd2|
      next unless cd_is_identical(cd1, cd2) || cd_is_very_similar(cd1, cd2)
      next unless cd_dois_arent_different(cd1, cd2)

      dups << [cd1[:id], cd2[:id]].sort
    end
  end
  # Full array comparison (not just the first id) keeps ties in a stable order.
  dups.sort
end

#by_pubmed_id ⇒ Object



57
58
59
# File 'lib/buhos/duplicate_analysis.rb', line 57

# Lists the PubMed ids that occur on more than one canonical document.
# Documents without a PubMed id are left out before counting.
#
# @return [Array] one entry per repeated PubMed id
def by_pubmed_id
  with_id = canonical_documents.exclude(pubmed_id: nil)
  repeated = with_id.group_and_count(:pubmed_id).having { count.function.* > 1 }
  repeated.all.map { |row| row[:pubmed_id] }
end

#by_scielo_id ⇒ Object



54
55
56
# File 'lib/buhos/duplicate_analysis.rb', line 54

# Lists the SciELO ids that occur on more than one canonical document.
# Documents without a SciELO id are left out before counting.
#
# @return [Array] one entry per repeated SciELO id
def by_scielo_id
  with_id = canonical_documents.exclude(scielo_id: nil)
  repeated = with_id.group_and_count(:scielo_id).having { count.function.* > 1 }
  repeated.all.map { |row| row[:scielo_id] }
end

#by_scopus_id ⇒ Object



48
49
50
# File 'lib/buhos/duplicate_analysis.rb', line 48

# Lists the Scopus ids that occur on more than one canonical document.
# Documents without a Scopus id are left out before counting.
#
# @return [Array] one entry per repeated Scopus id
def by_scopus_id
  with_id = canonical_documents.exclude(scopus_id: nil)
  repeated = with_id.group_and_count(:scopus_id).having { count.function.* > 1 }
  repeated.all.map { |row| row[:scopus_id] }
end

#by_wos_id ⇒ Object



51
52
53
# File 'lib/buhos/duplicate_analysis.rb', line 51

# Lists the WoS ids that occur on more than one canonical document.
# Documents without a WoS id are left out before counting.
#
# @return [Array] one entry per repeated WoS id
def by_wos_id
  with_id = canonical_documents.exclude(wos_id: nil)
  repeated = with_id.group_and_count(:wos_id).having { count.function.* > 1 }
  repeated.all.map { |row| row[:wos_id] }
end

#cd_dois_arent_different(cd1, cd2) ⇒ Object



65
66
67
# File 'lib/buhos/duplicate_analysis.rb', line 65

# Tells whether the DOIs of two documents are compatible, i.e. they do not
# contradict each other. A missing DOI on either side never contradicts.
#
# @param cd1 first document, read through cd1[:doi]
# @param cd2 second document, read through cd2[:doi]
# @return [Boolean] false only when both DOIs are present and differ
def cd_dois_arent_different(cd1, cd2)
  cd1[:doi].nil? || cd2[:doi].nil? || cd1[:doi] == cd2[:doi]
end

#cd_is_identical(cd1, cd2) ⇒ Object



61
62
63
# File 'lib/buhos/duplicate_analysis.rb', line 61

# Tells whether two documents carry the same bibliographic identity.
#
# Title and year must match. When the first document has a journal, journal
# and pages must match too. Note that only cd1's journal is tested for nil,
# so the check is not symmetric in its arguments.
#
# @param cd1 first document, read through [:title], [:year], [:journal], [:pages]
# @param cd2 second document, read through the same keys
# @return [Boolean]
def cd_is_identical(cd1, cd2)
  return false unless cd1[:title] == cd2[:title]
  return false unless cd1[:year] == cd2[:year]
  return true if cd1[:journal].nil?

  cd1[:journal] == cd2[:journal] && cd1[:pages] == cd2[:pages]
end

#cd_is_very_similar(cd1, cd2) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/buhos/duplicate_analysis.rb', line 68

# Fuzzy comparison of two documents.
#
# Each document is reduced to one lowercase string made of year, title,
# authors, journal and pages, with every character other than ASCII letters,
# digits and whitespace removed. The documents are very similar when the
# Levenshtein distance between both strings is below 5. If the first
# document's string has 10 characters or fewer, the answer is always false.
#
# @param cd1 first document, read through [:year], [:title], [:authors],
#   [:journal] and [:pages]
# @param cd2 second document, read through the same keys
# @return [Boolean]
def cd_is_very_similar(cd1, cd2)
  fingerprint = lambda do |cd|
    "#{cd[:year]} #{cd[:title]} #{cd[:authors]} #{cd[:journal]} #{cd[:pages]}".gsub(/[^A-Za-z\d\s]/, "").downcase
  end
  left = fingerprint.call(cd1)
  right = fingerprint.call(cd2)
  # Too little text on the first document to compare meaningfully.
  return false unless left.length > 10

  Levenshtein.distance(left, right) < 5
end