Class: Buhos::DuplicateAnalysis
- Inherits:
 - 
      Object
      
        
- Object
 - Buhos::DuplicateAnalysis
 
 
- Defined in:
 - lib/buhos/duplicate_analysis.rb
 
Instance Attribute Summary collapse
- 
  
    
      #canonical_documents  ⇒ Object 
    
    
  
  
  
  
    
      readonly
    
    
  
  
  
  
  
  
    
Returns the value of attribute canonical_documents.
 
Instance Method Summary collapse
- 
  
    
      #by_doi  ⇒ Object 
    
    
  
  
  
  
  
  
  
  
  
    
Returns the list of DOIs shared by more than one canonical document.
 - 
  
    
      #by_metadata  ⇒ Object 
    
    
  
  
  
  
  
  
  
  
  
    
Finds duplicates by metadata, using the year as blocking key.
 - #by_pubmed_id ⇒ Object
 - #by_scielo_id ⇒ Object
 - #by_scopus_id ⇒ Object
 - #by_wos_id ⇒ Object
 - #cd_dois_arent_different(cd1, cd2) ⇒ Object
 - #cd_is_identical(cd1, cd2) ⇒ Object
 - #cd_is_very_similar(cd1, cd2) ⇒ Object
 - 
  
    
      #initialize(cds)  ⇒ DuplicateAnalysis 
    
    
  
  
  
    constructor
  
  
  
  
  
  
  
    
A new instance of DuplicateAnalysis.
 
Constructor Details
#initialize(cds) ⇒ DuplicateAnalysis
Returns a new instance of DuplicateAnalysis.
      35 36 37 38 39 40 41 42 43  | 
    
      # File 'lib/buhos/duplicate_analysis.rb', line 35
      # Stores the collection to analyse and loads a Levenshtein implementation.
      #
      # 'levenshtein-ffi' is tried first; when it cannot be loaded, the
      # 'levenshtein' library is required instead. A LoadError from that
      # second require is not handled here.
      #
      # @param cds the canonical documents to analyse
      #   (presumably a Sequel dataset - confirm against callers)
      def initialize(cds)
        @canonical_documents = cds
        require 'levenshtein-ffi'
      rescue LoadError
        require 'levenshtein'
      end
  
Instance Attribute Details
#canonical_documents ⇒ Object (readonly)
Returns the value of attribute canonical_documents.
      34 35 36  | 
    
      # File 'lib/buhos/duplicate_analysis.rb', line 34
      # Read-only accessor.
      #
      # @return [Object] the current value of @canonical_documents
      def canonical_documents
        @canonical_documents
      end
  
Instance Method Details
#by_doi ⇒ Object
Returns the list of DOIs shared by more than one canonical document.
      45 46 47  | 
    
      # File 'lib/buhos/duplicate_analysis.rb', line 45
      # Lists the doi values that occur more than once.
      #
      # Rows with a nil doi are excluded, the rest are grouped and counted by
      # doi, and only groups with a count above 1 are kept.
      # NOTE(review): the query methods look like the Sequel dataset API - confirm.
      #
      # @return [Array] one doi per repeated group
      def by_doi
        with_doi = canonical_documents.exclude(doi: nil)
        repeated = with_doi.group_and_count(:doi).having { count.function.* > 1 }
        repeated.all.map { |row| row[:doi] }
      end
  
#by_metadata ⇒ Object
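Finds duplicates by metadata using a blocking method based on year: only documents that share the same year are compared with each other. See https://www.sciencedirect.com/science/article/pii/S1319157817304512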
      84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104  | 
    
      # File 'lib/buhos/duplicate_analysis.rb', line 84
      # Finds pairs of canonical documents whose metadata marks them as
      # probable duplicates.
      #
      # Documents are grouped by :year (blocking), so only documents inside the
      # same year group are compared. A pair is reported when
      # cd_dois_arent_different accepts it and either cd_is_identical or
      # cd_is_very_similar holds.
      #
      # @return [Array<Array>] pairs of ids; each pair is sorted and the list
      #   of pairs is sorted as well
      def by_metadata
        groups = canonical_documents.to_hash_groups(:year)
        dups = groups.values.flat_map do |group|
          matches = group.combination(2).select do |cd1, cd2|
            # The cheap DOI check runs first so the Levenshtein comparison is
            # skipped for pairs that can never be reported.
            cd_dois_arent_different(cd1, cd2) &&
              (cd_is_identical(cd1, cd2) || cd_is_very_similar(cd1, cd2))
          end
          matches.map { |cd1, cd2| [cd1[:id], cd2[:id]].sort }
        end
        # Sorting whole pairs (not just the first id) keeps the output
        # deterministic when several pairs share their first id.
        dups.sort
      end
  
#by_pubmed_id ⇒ Object
      57 58 59  | 
    
      # File 'lib/buhos/duplicate_analysis.rb', line 57
      # Lists the pubmed_id values that occur more than once.
      #
      # Rows with a nil pubmed_id are excluded, the rest are grouped and
      # counted by pubmed_id, and only groups with a count above 1 are kept.
      # NOTE(review): the query methods look like the Sequel dataset API - confirm.
      #
      # @return [Array] one pubmed_id per repeated group
      def by_pubmed_id
        with_id = canonical_documents.exclude(pubmed_id: nil)
        repeated = with_id.group_and_count(:pubmed_id).having { count.function.* > 1 }
        repeated.all.map { |row| row[:pubmed_id] }
      end
  
#by_scielo_id ⇒ Object
      54 55 56  | 
    
      # File 'lib/buhos/duplicate_analysis.rb', line 54
      # Lists the scielo_id values that occur more than once.
      #
      # Rows with a nil scielo_id are excluded, the rest are grouped and
      # counted by scielo_id, and only groups with a count above 1 are kept.
      # NOTE(review): the query methods look like the Sequel dataset API - confirm.
      #
      # @return [Array] one scielo_id per repeated group
      def by_scielo_id
        with_id = canonical_documents.exclude(scielo_id: nil)
        repeated = with_id.group_and_count(:scielo_id).having { count.function.* > 1 }
        repeated.all.map { |row| row[:scielo_id] }
      end
  
#by_scopus_id ⇒ Object
      48 49 50  | 
    
      # File 'lib/buhos/duplicate_analysis.rb', line 48
      # Lists the scopus_id values that occur more than once.
      #
      # Rows with a nil scopus_id are excluded, the rest are grouped and
      # counted by scopus_id, and only groups with a count above 1 are kept.
      # NOTE(review): the query methods look like the Sequel dataset API - confirm.
      #
      # @return [Array] one scopus_id per repeated group
      def by_scopus_id
        with_id = canonical_documents.exclude(scopus_id: nil)
        repeated = with_id.group_and_count(:scopus_id).having { count.function.* > 1 }
        repeated.all.map { |row| row[:scopus_id] }
      end
  
#by_wos_id ⇒ Object
      51 52 53  | 
    
      # File 'lib/buhos/duplicate_analysis.rb', line 51
      # Lists the wos_id values that occur more than once.
      #
      # Rows with a nil wos_id are excluded, the rest are grouped and counted
      # by wos_id, and only groups with a count above 1 are kept.
      # NOTE(review): the query methods look like the Sequel dataset API - confirm.
      #
      # @return [Array] one wos_id per repeated group
      def by_wos_id
        with_id = canonical_documents.exclude(wos_id: nil)
        repeated = with_id.group_and_count(:wos_id).having { count.function.* > 1 }
        repeated.all.map { |row| row[:wos_id] }
      end
  
#cd_dois_arent_different(cd1, cd2) ⇒ Object
      65 66 67  | 
    
      # File 'lib/buhos/duplicate_analysis.rb', line 65
      # Tells whether the DOIs of two records do not rule out that they are
      # the same document.
      #
      # A missing DOI on either side gives no evidence of a difference, so the
      # answer is true. When both DOIs are present they are compared ignoring
      # letter case, because DOI names are case-insensitive.
      #
      # @param cd1 [#[]] first record, read through cd1[:doi]
      # @param cd2 [#[]] second record, read through cd2[:doi]
      # @return [Boolean] false only when both DOIs are present and differ
      def cd_dois_arent_different(cd1, cd2)
        doi1 = cd1[:doi]
        doi2 = cd2[:doi]
        return true if doi1.nil? || doi2.nil?

        doi1.to_s.downcase == doi2.to_s.downcase
      end
  
#cd_is_identical(cd1, cd2) ⇒ Object
      61 62 63  | 
    
      # File 'lib/buhos/duplicate_analysis.rb', line 61
      # Tells whether two records carry the same core metadata.
      #
      # Title and year must always match. When both records name a journal,
      # journal and pages must match too; when either record has no journal,
      # title and year alone decide. The journal check is symmetric, so the
      # result does not depend on the order of the arguments.
      #
      # @param cd1 [#[]] first record (:title, :year, :journal, :pages)
      # @param cd2 [#[]] second record (same keys)
      # @return [Boolean]
      def cd_is_identical(cd1, cd2)
        return false unless cd1[:title] == cd2[:title] && cd1[:year] == cd2[:year]
        # Without a journal on one side there is nothing further to compare.
        return true if cd1[:journal].nil? || cd2[:journal].nil?

        cd1[:journal] == cd2[:journal] && cd1[:pages] == cd2[:pages]
      end
  
#cd_is_very_similar(cd1, cd2) ⇒ Object
      68 69 70 71 72 73 74 75 76 77 78 79  | 
    
      # File 'lib/buhos/duplicate_analysis.rb', line 68
      # Tells whether two records are near-duplicates, judged by the
      # Levenshtein distance between normalised summaries of their metadata.
      #
      # Each summary joins year, title, authors, journal and pages with
      # spaces, drops every character that is not an ASCII letter, a digit or
      # whitespace, and lowercases the rest.
      #
      # @param cd1 [#[]] first record (:year, :title, :authors, :journal, :pages)
      # @param cd2 [#[]] second record (same keys)
      # @param min_length [Integer] the summary of cd1 must be longer than
      #   this, otherwise the records are never considered similar
      # @param max_distance [Integer] the distance must be strictly below
      #   this value for the records to count as similar
      # @return [Boolean]
      def cd_is_very_similar(cd1, cd2, min_length: 10, max_distance: 5)
        t1, t2 = [cd1, cd2].map do |cd|
          summary = "#{cd[:year]} #{cd[:title]} #{cd[:authors]} #{cd[:journal]} #{cd[:pages]}"
          summary.gsub(/[^A-Za-z\d\s]/, "").downcase
        end
        # Very short summaries would fall under the distance limit by accident.
        return false unless t1.length > min_length

        Levenshtein.distance(t1, t2) < max_distance
      end