class Statsample::Bivariate::Tetrachoric

Compute tetrachoric correlation.

The tetrachoric correlation is a measure of bivariate association arising when both observed variates are categorical variables that result from dichotomizing the two undelying continuous variables (Drasgow, 2006). The tetrachoric correlation is a good way to measure rater agreement (Uebersax, 2006)

This class uses Brown (1977) algorithm. You can see FORTRAN code on lib.stat.cmu.edu/apstat/116

Usage

With two variables x and y on a crosstab like this:

      -------------
      | y=0 | y=1 |
      -------------
x = 0 |  a  |  b  |
      -------------
x = 1 |  c  |  d  |
      -------------

The code will be

tc=Statsample::Bivariate::Tetrachoric.new(a,b,c,d)
tc.r # correlation
tc.se # standard error
tc.threshold_y # threshold for y variable
tc.threshold_x # threshold for x variable

Reference:

Constants

CHALF
CITER
CONST
CONV
NITER
RCUT
RLIMIT
RequerimentNotMeet
SQT2PI
TWOPI
UPLIM
W
X

Attributes

name[RW]

Name on the analysis

r[R]

Value for tethrachoric correlation

ruby_engine[RW]

Use ruby version of algorithm. By default, this attribute is set to false, and C version of algorithm is used

Public Class Methods

new(a,b,c,d, opts=Hash.new) click to toggle source

Creates a new tetrachoric object for analysis

# File lib/statsample/bivariate/tetrachoric.rb, line 146
def initialize(a,b,c,d, opts=Hash.new)
  @a,@b,@c,@d=a,b,c,d
  
  opts_default={
    :name=>_("Tetrachoric correlation"),
    :ruby_engine=>false
  }
  
  @opts=opts_default.merge opts
  @opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k}
  #
  #       CHECK IF ANY CELL FREQUENCY IS NEGATIVE
  #
  raise "All frequencies should be positive" if  (@a < 0 or @b < 0 or @c < 0  or @d < 0)
  compute
end
new_with_matrix(m, opts=Hash.new) click to toggle source

Creates a Tetrachoric object based on a 2x2 Matrix.

# File lib/statsample/bivariate/tetrachoric.rb, line 93
def self.new_with_matrix(m, opts=Hash.new)
  Tetrachoric.new(m[0,0], m[0,1], m[1,0],m[1,1], opts)
end
new_with_vectors(v1,v2, opts=Hash.new) click to toggle source

Creates a Tetrachoric object based on two vectors. The vectors are dichotomized previously.

# File lib/statsample/bivariate/tetrachoric.rb, line 98
def self.new_with_vectors(v1,v2, opts=Hash.new)
  v1a, v2a=Statsample.only_valid(v1,v2)
  v1a=v1a.dichotomize
  v2a=v2a.dichotomize
  raise RequerimentNotMeet, "v1 have only 0" if v1a.factors==[0]
  raise RequerimentNotMeet, "v2 have only 0" if v2a.factors==[0]
  a,b,c,d = 0,0,0,0
  v1a.each_index{|i|
    x,y=v1a[i],v2a[i]
    a+=1 if x==0 and y==0
    b+=1 if x==0 and y==1
    c+=1 if x==1 and y==0
    d+=1 if x==1 and y==1
  }
  Tetrachoric.new(a,b,c,d, opts)
end

Public Instance Methods

check_frequencies() click to toggle source
# File lib/statsample/bivariate/tetrachoric.rb, line 176
def check_frequencies
  #
  #       CHECK IF ANY FREQUENCY IS 0.0 AND SET kdelta
  #
  @kdelta = 1
 
  @kdelta  = 2 if (@a == 0 or @d == 0)
  @kdelta += 2 if (@b == 0 or @c == 0)
  #
  #        kdelta=4 MEANS TABLE HAS 0.0 ROW OR COLUMN, RUN IS TERMINATED
  #

  raise RequerimentNotMeet, "Rows and columns should have more than 0 items" if @kdelta==4
end
compute_optimized() click to toggle source

Compute the tetrachoric correlation

# File lib/statsample/bivariate/tetrachoric.rb, line 170
def compute_optimized
  check_frequencies        
  t=Statsample::STATSAMPLE__.tetrachoric(@a,@b,@c,@d)
  raise "Error on calculation of tetrachoric correlation: #{t[:ifault]}" if t[:ifault]>0
  @r,@sdr,@itype,@ifault,@zab, @zac = t[:r],t[:sdr],t[:itype],t[:ifault], t[:threshold_x], t[:threshold_y]
end
compute_ruby() click to toggle source

Compute the tetrachoric correlation using ruby Called on object creation.

# File lib/statsample/bivariate/tetrachoric.rb, line 193
def compute_ruby
  check_frequencies
  #
  # INITIALIZATION
  #
  @r = 0
  sdzero = 0
  @sdr = 0
  @itype = 0
  @ifault = 0
  delta  = 0
  

  #      GOTO (4, 1, 2 , 92), kdelta
  #
  #        delta IS 0.0, 0.5 OR -0.5 ACCORDING TO WHICH CELL IS 0.0
  #

  if(@kdelta==2)
    # 1
    delta=0.5
    @r=-1 if (@a==0 and @d==0)
  elsif(@kdelta==3)
    # 2
    delta=-0.5
    @r=1 if (@b==0 and @c==0)
  end
  # 4
  if @r!=0
    @itype=3
  end

  #
  #        STORE FREQUENCIES IN  AA, BB, CC AND DD
  #
  @aa = @a + delta
  @bb = @b - delta
  @cc = @c - delta
  @dd = @d + delta
  @tot = @aa+@bb+@cc+@dd
  #
  #        CHECK IF CORRELATION IS NEGATIVE, 0.0, POSITIVE
  #        IF (AA * DD - BB * CC) 7, 5, 6

  corr_dir=@aa * @dd - @bb * @cc
  if(corr_dir < 0)
    # 7
    @probaa = @bb.quo(@tot)
    @probac = (@bb + @dd).quo(@tot)
    @ksign = 2
    # ->  8
  else
    if (corr_dir==0)
      # 5
      @itype=4
    end
    # 6
    #
    #        COMPUTE PROBABILITIES OF QUADRANT AND OF MARGINALS
    #        PROBAA AND PROBAC CHOSEN SO THAT CORRELATION IS POSITIVE.
    #        KSIGN INDICATES WHETHER QUADRANTS HAVE BEEN SWITCHED
    #

    @probaa = @aa.quo(@tot)
    @probac = (@aa+@cc).quo(@tot)
    @ksign=1
  end
  # 8

  @probab = (@aa+@bb).quo(@tot)

  #
  #        COMPUTE NORMAL DEVIATES FOR THE MARGINAL FREQUENCIES
  #        SINCE NO MARGINAL CAN BE 0.0, IE IS NOT CHECKED
  #
  @zac = Distribution::Normal.p_value(@probac.to_f)
  @zab = Distribution::Normal.p_value(@probab.to_f)
  @ss = Math::exp(-0.5 * (@zac ** 2 + @zab ** 2)).quo(TWOPI)
  #
  #        WHEN R IS 0.0, 1.0 OR -1.0, TRANSFER TO COMPUTE SDZERO
  #
  if (@r != 0 or @itype > 0)
    compute_sdzero
    return true
  end
  #
  #        WHEN MARGINALS ARE EQUAL, COSINE EVALUATION IS USED
  #
  if (@a == @b and @b == @c)
    calculate_cosine
    return true
  end
  #
  #        INITIAL ESTIMATE OF CORRELATION IS YULES Y
  #
  @rr = ((Math::sqrt(@aa * @dd) - Math::sqrt(@bb * @cc)) ** 2)  / (@aa * @dd - @bb * @cc).abs
  @iter = 0
  begin
    #
    #        IF RR EXCEEDS RCUT, GAUSSIAN QUADRATURE IS USED
    #
    #10
    if @rr>RCUT
      gaussian_quadrature
      return true
    end
    #
    #        TETRACHORIC SERIES IS COMPUTED
    #
    #        INITIALIZATION
    #
    va=1.0
    vb=@zac.to_f
    wa=1.0
    wb=@zab.to_f
    term = 1.0
    iterm = 0.0
    @sum = @probab * @probac
    deriv = 0.0
    sr = @ss
    #15
    begin
      if(sr.abs<=CONST)
        #
        #        RESCALE TERMS TO AVOID OVERFLOWS AND UNDERFLOWS
        #
        sr = sr  / CONST
        va = va * CHALF
        vb = vb * CHALF
        wa = wa * CHALF
        wb = wb * CHALF
      end
      #
      #        FORM SUM AND DERIVATIVE OF SERIES
      #
      #  20
      dr = sr * va * wa
      sr = sr * @rr / term
      cof = sr * va * wa
      #
      #        ITERM COUNTS NO. OF CONSECUTIVE TERMS  <  CONV
      #
      iterm+=  1
      iterm=0 if (cof.abs > CONV)
      @sum = @sum + cof
      deriv += dr
      vaa = va
      waa = wa
      va = vb
      wa = wb
      vb = @zac * va - term * vaa
      wb = @zab * wa - term * waa
      term += 1
    end while (iterm < 2 or term < 6)
    #
    #        CHECK IF ITERATION CONVERGED
    #
    if((@sum-@probaa).abs <= CITER)
      @itype=term
      calculate_sdr
      return true
    end
    #
    #        CALCULATE NEXT ESTIMATE OF CORRELATION
    #
    #25
    @iter += 1
    #
    #        IF TOO MANY ITERATlONS, RUN IS TERMINATED
    #
    delta = (@sum - @probaa) /  deriv
    @rrprev = @rr
    @rr = @rr - delta
    @rr += 0.5 * delta if(@iter == 1)
    @rr= RLIMIT if (@rr > RLIMIT)
    @rr =0 if (@rr  <  0.0)
  end while @iter < NITER
  raise "Too many iteration"
  #  GOTO 10
end
se() click to toggle source

Standard error

# File lib/statsample/bivariate/tetrachoric.rb, line 115
def se
  @sdr
end
threshold_x() click to toggle source

Threshold for variable x (rows) Point on gauss curve under X rater select cases

# File lib/statsample/bivariate/tetrachoric.rb, line 120
def threshold_x
  @zab
end
threshold_y() click to toggle source

Threshold for variable y (columns) Point on gauss curve under Y rater select cases

# File lib/statsample/bivariate/tetrachoric.rb, line 127
def threshold_y
  @zac
end

Private Instance Methods

calculate_cosine() click to toggle source
# File lib/statsample/bivariate/tetrachoric.rb, line 445
def calculate_cosine
  #
  #        WHEN ALL MARGINALS ARE EQUAL THE COSINE FUNCTION IS USED
  #
  @rr = -Math::cos(TWOPI * @probaa)
  @itype = 2
  calculate_sdr
end
compute() click to toggle source
# File lib/statsample/bivariate/tetrachoric.rb, line 162
def compute
  if !ruby_engine and Statsample::OPTIMIZED and Statsample::STATSAMPLE__.respond_to? :tetrachoric
    compute_optimized
  else
    compute_ruby
  end
end
compute_sdzero() click to toggle source

85

COMPUTE SDZERO
# File lib/statsample/bivariate/tetrachoric.rb, line 481
def compute_sdzero
  @sdzero = Math::sqrt(((@aa + @bb) * (@aa + @cc) * (@bb + @dd) * (@cc + @dd)).quo(@tot)).quo(@tot ** 2 * @ss)
  @sdr = @sdzero if (@r == 0)
end
gaussian_quadrature() click to toggle source

GAUSSIAN QUADRATURE 40

# File lib/statsample/bivariate/tetrachoric.rb, line 375
def gaussian_quadrature
  if(@iter==0)
    # INITIALIZATION, IF THIS IS FIRST ITERATION
    @sum=@probab*@probac
    @rrprev=0
  end

  # 41
  sumprv = @probab - @sum
  @prob = @bb.quo(@tot)
  @prob = @aa.quo(@tot) if (@ksign == 2)
  @itype = 1
  #
  # LOOP TO FIND ESTIMATE OF CORRELATION
  #  COMPUTATION OF INTEGRAL (SUM) BY QUADRATURE
  #
  # 42

  begin
    rrsq = Math::sqrt(1 - @rr ** 2)
    amid = 0.5 * (UPLIM + @zac)
    xlen = UPLIM - amid
    @sum = 0
    (1..16).each do |iquad|
      xla = amid + X[iquad] * xlen
      xlb = amid - X[iquad] * xlen


      #
      #       TO AVOID UNDERFLOWS, TEMPA AND TEMPB ARE USED
      #
      tempa = (@zab - @rr * xla) / rrsq
      if (tempa >= -6.0)
        @sum = @sum + W[iquad] * Math::exp(-0.5  * xla ** 2) * Distribution::Normal.cdf(tempa)
      end
      tempb = (@zab - @rr * xlb) / rrsq

      if (tempb >= -6.0)
        @sum = @sum + W[iquad] * Math::exp(-0.5 * xlb ** 2) * Distribution::Normal.cdf(tempb)
      end
    end # 44 ~ iquad
    @sum=@sum*xlen / SQT2PI
    #
    # CHECK IF ITERATION HAS CONVERGED
    #
    if ((@prob - @sum).abs <= CITER)
      calculate_sdr
      return true
    end
    # ESTIMATE CORRELATION FOR NEXT ITERATION BY LINEAR INTERPOLATION

    rrest = ((@prob -  @sum) * @rrprev - (@prob - sumprv) * @rr) / (sumprv - @sum)
    rrest = RLIMIT if (rrest > RLIMIT)
    rrest = 0 if (rrest < 0)
    @rrprev = @rr
    @rr = rrest
    sumprv = @sum
    #
    #        if estimate has same value on two iterations, stop iteration
    #
    if @rr == @rrprev
      calculate_sdr
      return true
    end


  end while @iter < NITER
  raise "Too many iterations"
  # ir a 42
end