Article: Mondrian Multidimensional K-Anonymity

Lame Ruby Implementation:

# ==================================================================================
# anonymization: group.rb
# ==================================================================================
ENVIRONMENT = 'release' #'release'

require 'set'
require 'rubygems'
require 'ruby-debug' if ENVIRONMENT == 'debug'

# ==================================================================================
# class group
#
# usage:
#  require 'group'
#
#  g = Group.new <quasi_ids>, <filename>
#  g.anonymize <k>
#
# example:
#
# lefevre.db
#
#     0             2   < -- quasi_ids
#
#   |age|  sex  | zipc | disease      |
#---+---+-------+------+--------------+--
# 0 | 25  Male    53711 Flu           |
# 1 | 25  Female  53712 Hepatitis     |
# 2 | 26  Male    53711 Bronchitis    |
# 3 | 27  Male    53710 Broken_Arm    |
# 4 | 27  Female  53712 AIDS          |
# 5 | 28  Male    53711 Hang_Nail     |
#---+---+-------+------+--------------+--
#
# irb
#  >> require 'group'
#  >> g = Group.new [0,2], 'lefevre.db'
#  >> g.anonymize 2, 'degen'
# ==================================================================================
class Group
  # create a setter method for @tuples, @filename
  # so that g.tuples = x works
  attr_writer :tuples, :filename

  @@debug = { 'best_attribute' => ENVIRONMENT == 'debug',
              'intersection'   => ENVIRONMENT == 'debug',
              'split'          => ENVIRONMENT == 'debug',
              'ordering'       => ENVIRONMENT == 'debug',
              'vars'           => ENVIRONMENT == 'debug',
              'args'           => ENVIRONMENT == 'debug'
           }
  # ================================================================================
  # to create a new group with Group.new
  # ================================================================================
  # needs to remove the full_ids from the read.
  def initialize(quasi_ids, filename, depth=0, available_ids=nil)
    # if no valid attributes are given quasi are used
    available_ids = quasi_ids if available_ids.nil?

    # initialize the instance vars
    @tuples = []
    @quasi_ids = quasi_ids
    @available_ids = available_ids
    @depth = depth

    # serves as wilcard so that no file is read on recursion
    filename == '*wc' ? @filename = nil : @filename = filename

    if @@debug['args'] and @depth == 0
      debug_puts "args : file => #{@filename}"
      debug_puts "args : k => #{@k}"
      debug_puts "args : quasi_ids => #{@quasi_ids.to_s}"
    end

    # run the read and backup procedures
    read
  end

  # ================================================================================
  # anonymization
  # ================================================================================
  def anonymize(k, heuristic='degen', partial_order=[])

    if @@debug['vars']
      #debug_puts "dvars : @tuples #{@tuples}"
      debug_puts "dvars : @available_ids #{@available_ids},"
      debug_puts "dvars : @depth #{@depth}"
    end

    # stop case
    if isnt_splittable? k
      debug_puts "dsplit: no split available for k-level #{k} with size" +
                 " #{@tuples.size}" if @@debug['split']

      # sort and generalize remaining attributes
      @available_ids.each do |attribute|
        sort attribute
        generalize attribute
      end

      # exit
      return
    end

    # where and in what attribute should we split
    # these functions have a heavy effect on the usefulness of the information
    # for the k-anonymity table
    split_attribute  = find_split_attribute @available_ids, heuristic, partial_order
    split_pos        = find_split_position split_attribute

    # create the groups for the
    # recursion
    group1 = Group.new @quasi_ids, '*wc', @depth + 1, @available_ids.clone
    group2 = Group.new @quasi_ids, '*wc', @depth + 1, @available_ids.clone

    # split at the given position
    split split_pos, group1, group2

    if split_groups_satisfy_k_anonymity?(k,group1,group2)

      debug_puts "dsplit: no more split available with attribute" +
          " #{split_attribute} (g1: #{group1.size}, g2: #{group2.size})" if @@debug['split']

      # generalize by split_attribute and then remove it from the available
      # attributes array
      generalize split_attribute
      @available_ids.delete split_attribute

      # anonymize remaining available attributes
      anonymize k, heuristic, partial_order

    else # splitting successful
      debug_puts "dsplit: splitting on attribute #{split_attribute} at" +
                 " position #{split_pos} of #{@tuples.size}" if @@debug['split']

      # assign the two groups to this instance
      @group1 = group1
      @group2 = group2

      group1.anonymize k, heuristic, partial_order
      group2.anonymize k, heuristic, partial_order

      #@tuples = []
    end
  end

  # ================================================================================
  # io and backup related
  # ================================================================================
  # read @tuples from @filename
  def read
    unless @filename.nil?
      f = File.open @filename
      f.each_line do |line|
        @tuples < < line.rstrip.split("\t\t")
      end
      f.close
    end
  end

  # reset the class to reuse
  def reset
    @available_ids  = @originally_available_ids
    @tuples = []
    read
  end

  # ================================================================================
  # overrides
  # ================================================================================
  # number of tuples
  def size
    @tuples.size
  end

  # ================================================================================
  # aux
  # ================================================================================
  # to_s

  def to_s

    str = ""

    unless @tuples.empty?
      @tuples.each do |line|
        @tuples[0].size.times { |i| str << line[i].to_s + "\t\t"}
        str << "\n"
      end
    end

    str
  end

  # shows a yaml representation of internal object
  def to_y
    require 'yaml'
    y self
  end

  private

  def debug_puts(message)
    ident=''
    @depth.times {|i| ident+="  "}
    puts ident + message
  end

  # ================================================================================
  # aux for anonymization
  # ================================================================================
  # finds the attribute with the largest range. According to LeFevre this is a good
  # heuristic to find the attribute on
  def find_split_attribute(attributes_list, heuristic, partial_order)

    debug_puts "dorder: choosing from" +
               " #{attributes_list.to_s}" if @@debug['ordering']

    best_attrib = -1
    best_attrib_count = 0.0

    attributes_list = find_minimal_elements partial_order, attributes_list

    debug_puts "dorder: minimal list is" +
               " #{attributes_list.to_s}" if @@debug['ordering']

    attributes_list.each do |attribute|
      values = @tuples.map{|t| t[attribute]}.to_set

      # degen heuristic: split on the attribute that had more degeneracy
      if heuristic == 'degen'
        if values.size < best_attrib_count or best_attrib == -1
          best_attrib = attribute
          best_attrib_count = @tuples.size.to_f / values.size.to_f
        end
      elsif heuristic == 'single'
        if values.size < best_attrib_count or best_attrib == -1
          best_attrib = attribute
          best_attrib_count = values.size
        end
      else #default
        if values.size > best_attrib_count
          best_attrib = attribute
          best_attrib_count = values.size
        end
      end
    end

    debug_puts "dbest : best atribute is #{best_attrib} with" +
               " count #{best_attrib_count}" if @@debug['best_attribute']

    return best_attrib
  end

  #  returns the position of the leftmost or rightmost median element.
  #  used to split in lhs and rhs
  def find_split_position(attribute_id)
    sort attribute_id

    median_pos = @tuples.size / 2
    median = @tuples[median_pos][attribute_id]

    split_pos_high = median_pos
    split_pos_low  = median_pos

    # split point correspond to highest index that has median value
    split_pos_high += 1 while (@tuples.size >= split_pos_high + 2) and
                              (@tuples[split_pos_high + 1][attribute_id] == median)

    high_smaller_group_size =
            [split_pos_high + 1, @tuples.size - split_pos_high - 1].min

    # split point correspond to lowest index that has median value
    split_pos_low -= 1 while (split_pos_low > 1) and
                              (@tuples[split_pos_low - 1][attribute_id] == median)

    low_smaller_group_size =
            [split_pos_low, @tuples.size - split_pos_low].min

    # choose the one with the largest group
    if high_smaller_group_size > low_smaller_group_size
      split_pos = split_pos_high
    else
      split_pos = split_pos_low - 1
    end

    return split_pos
  end

  # finds minimal elements from the list of the given attribute list according to
  # partial order specified in partial_order. partial_order contains all complete chains.
  def find_minimal_elements(partial_order, possible_elements)

    if partial_order.empty?
      debug_puts "dorder: no ordering specified" if @@debug['ordering']

      return possible_elements
    end

    # choose all possible_elements that arent in partial_order
    # those are minimal
    minimal_list = possible_elements.select { |element| !partial_order.flatten.member?(element) }

    # haskell goodies ^^
    # restrict partial_order to values in possible_elements
    restricted_partial_order = partial_order.map { |l| l.select { |element| possible_elements.member?(element) } }

    if @@debug['ordering']
      debug_puts "dorder: possible_elements list is" +
                 " #{possible_elements.to_s}"
      debug_puts "dorder: partial_order list is" +
                 " #{partial_order.to_s}"
      debug_puts "dorder: restricted_partial_order is" +
                 " #{restricted_partial_order.to_s}"
    end

   non_zero_chains = restricted_partial_order.select { |chain| not chain.empty? }

   non_zero_chains.each do |c|
     candidate = c[0]

     minimal = !restricted_partial_order.any? do |chain|
        chain.member?(candidate) and chain[0] != candidate
     end

     if minimal and not minimal_list.member?(candidate)
       minimal_list << candidate
     end
   end

   return minimal_list
  end

  # replaces attribute value with generalization that cover all tuples.
  # Expects tuples to be sorted by attribute.
  def generalize(attribute)
    min_val = @tuples[0][attribute]
    max_val = @tuples[-1][attribute]

    unless min_val == max_val
      @tuples.each do |t|
        t[attribute] = [min_val, max_val]
      end
    end

  end

  def split(split_pos, group1, group2)
    group1.tuples = @tuples[0..split_pos]
    group2.tuples = @tuples[split_pos+1..@tuples.size]
  end

  def sort(attribute)
    @tuples = @tuples.sort_by { |t| t[attribute] }
  end

  # ================================================================================
  # verbose conditions
  # ================================================================================
  def isnt_splittable?(k)
    k < 2 or group_cant_be_split_for_level?(k) or no_split_attributes_are_available?
  end

  def group_cant_be_split_for_level?(k)
    @tuples.size < 2*k
  end

  def no_split_attributes_are_available?
    @available_ids.empty?
  end

  def split_groups_satisfy_k_anonymity?(k,group1,group2)
    group1.size < k or group2.size < k
  end
end

# hack on array to display lists correctly
class Array
  def to_s
    "[" + self.join(',') + "]"
  end
end

One Response to “Mondrian Multidimensional K-Anonymity in Ruby”


  1. [...] a favor para qualquer maluquinho da segurança dos dados. E todos tiveram a oportunidade de ver que implementei o Mondrian no ultimo semestre no âmbito da unidade curricular de criptografia e segurança de sistemas [...]


Leave a Reply