class Statsample::Bivariate::Tetrachoric
Compute tetrachoric correlation.
The tetrachoric correlation is a measure of bivariate association arising when both observed variates are categorical variables that result from dichotomizing the two undelying continuous variables (Drasgow, 2006). The tetrachoric correlation is a good way to measure rater agreement (Uebersax, 2006)
This class uses Brown (1977) algorithm. You can see FORTRAN code on lib.stat.cmu.edu/apstat/116
Usage¶ ↑
With two variables x and y on a crosstab like this:
------------- | y=0 | y=1 | ------------- x = 0 | a | b | ------------- x = 1 | c | d | -------------
The code will be
tc=Statsample::Bivariate::Tetrachoric.new(a,b,c,d) tc.r # correlation tc.se # standard error tc.threshold_y # threshold for y variable tc.threshold_x # threshold for x variable
Reference:¶ ↑
-
Brown, MB. (1977) Algorithm AS 116: the tetrachoric correlation and its standard error. Applied Statistics, 26, 343-351.
-
Drasgow F. (2006).
Polychoric
and polyserial correlations. In Kotz L, Johnson NL (Eds.), Encyclopedia of statistical sciences. Vol. 7 (pp. 69-74). New York: Wiley. -
Uebersax, J.S. (2006). The tetrachoric and polychoric correlation coefficients. Statistical Methods for Rater Agreement web site. 2006. Available at: john-uebersax.com/stat/tetra.htm . Accessed February, 11, 2010
Constants
- CHALF
- CITER
- CONST
- CONV
- NITER
- RCUT
- RLIMIT
- RequerimentNotMeet
- SQT2PI
- TWOPI
- UPLIM
- W
- X
Attributes
Name on the analysis
Value for tethrachoric correlation
Use ruby version of algorithm. By default, this attribute is set to false, and C version of algorithm is used
Public Class Methods
Creates a new tetrachoric object for analysis
# File lib/statsample/bivariate/tetrachoric.rb, line 146 def initialize(a,b,c,d, opts=Hash.new) @a,@b,@c,@d=a,b,c,d opts_default={ :name=>_("Tetrachoric correlation"), :ruby_engine=>false } @opts=opts_default.merge opts @opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k} # # CHECK IF ANY CELL FREQUENCY IS NEGATIVE # raise "All frequencies should be positive" if (@a < 0 or @b < 0 or @c < 0 or @d < 0) compute end
Creates a Tetrachoric
object based on a 2x2 Matrix.
# File lib/statsample/bivariate/tetrachoric.rb, line 93 def self.new_with_matrix(m, opts=Hash.new) Tetrachoric.new(m[0,0], m[0,1], m[1,0],m[1,1], opts) end
Creates a Tetrachoric
object based on two vectors. The vectors are dichotomized previously.
# File lib/statsample/bivariate/tetrachoric.rb, line 98 def self.new_with_vectors(v1,v2, opts=Hash.new) v1a, v2a=Statsample.only_valid(v1,v2) v1a=v1a.dichotomize v2a=v2a.dichotomize raise RequerimentNotMeet, "v1 have only 0" if v1a.factors==[0] raise RequerimentNotMeet, "v2 have only 0" if v2a.factors==[0] a,b,c,d = 0,0,0,0 v1a.each_index{|i| x,y=v1a[i],v2a[i] a+=1 if x==0 and y==0 b+=1 if x==0 and y==1 c+=1 if x==1 and y==0 d+=1 if x==1 and y==1 } Tetrachoric.new(a,b,c,d, opts) end
Public Instance Methods
# File lib/statsample/bivariate/tetrachoric.rb, line 176 def check_frequencies # # CHECK IF ANY FREQUENCY IS 0.0 AND SET kdelta # @kdelta = 1 @kdelta = 2 if (@a == 0 or @d == 0) @kdelta += 2 if (@b == 0 or @c == 0) # # kdelta=4 MEANS TABLE HAS 0.0 ROW OR COLUMN, RUN IS TERMINATED # raise RequerimentNotMeet, "Rows and columns should have more than 0 items" if @kdelta==4 end
Compute the tetrachoric correlation
# File lib/statsample/bivariate/tetrachoric.rb, line 170 def compute_optimized check_frequencies t=Statsample::STATSAMPLE__.tetrachoric(@a,@b,@c,@d) raise "Error on calculation of tetrachoric correlation: #{t[:ifault]}" if t[:ifault]>0 @r,@sdr,@itype,@ifault,@zab, @zac = t[:r],t[:sdr],t[:itype],t[:ifault], t[:threshold_x], t[:threshold_y] end
Compute the tetrachoric correlation using ruby Called on object creation.
# File lib/statsample/bivariate/tetrachoric.rb, line 193 def compute_ruby check_frequencies # # INITIALIZATION # @r = 0 sdzero = 0 @sdr = 0 @itype = 0 @ifault = 0 delta = 0 # GOTO (4, 1, 2 , 92), kdelta # # delta IS 0.0, 0.5 OR -0.5 ACCORDING TO WHICH CELL IS 0.0 # if(@kdelta==2) # 1 delta=0.5 @r=-1 if (@a==0 and @d==0) elsif(@kdelta==3) # 2 delta=-0.5 @r=1 if (@b==0 and @c==0) end # 4 if @r!=0 @itype=3 end # # STORE FREQUENCIES IN AA, BB, CC AND DD # @aa = @a + delta @bb = @b - delta @cc = @c - delta @dd = @d + delta @tot = @aa+@bb+@cc+@dd # # CHECK IF CORRELATION IS NEGATIVE, 0.0, POSITIVE # IF (AA * DD - BB * CC) 7, 5, 6 corr_dir=@aa * @dd - @bb * @cc if(corr_dir < 0) # 7 @probaa = @bb.quo(@tot) @probac = (@bb + @dd).quo(@tot) @ksign = 2 # -> 8 else if (corr_dir==0) # 5 @itype=4 end # 6 # # COMPUTE PROBABILITIES OF QUADRANT AND OF MARGINALS # PROBAA AND PROBAC CHOSEN SO THAT CORRELATION IS POSITIVE. # KSIGN INDICATES WHETHER QUADRANTS HAVE BEEN SWITCHED # @probaa = @aa.quo(@tot) @probac = (@aa+@cc).quo(@tot) @ksign=1 end # 8 @probab = (@aa+@bb).quo(@tot) # # COMPUTE NORMAL DEVIATES FOR THE MARGINAL FREQUENCIES # SINCE NO MARGINAL CAN BE 0.0, IE IS NOT CHECKED # @zac = Distribution::Normal.p_value(@probac.to_f) @zab = Distribution::Normal.p_value(@probab.to_f) @ss = Math::exp(-0.5 * (@zac ** 2 + @zab ** 2)).quo(TWOPI) # # WHEN R IS 0.0, 1.0 OR -1.0, TRANSFER TO COMPUTE SDZERO # if (@r != 0 or @itype > 0) compute_sdzero return true end # # WHEN MARGINALS ARE EQUAL, COSINE EVALUATION IS USED # if (@a == @b and @b == @c) calculate_cosine return true end # # INITIAL ESTIMATE OF CORRELATION IS YULES Y # @rr = ((Math::sqrt(@aa * @dd) - Math::sqrt(@bb * @cc)) ** 2) / (@aa * @dd - @bb * @cc).abs @iter = 0 begin # # IF RR EXCEEDS RCUT, GAUSSIAN QUADRATURE IS USED # #10 if @rr>RCUT gaussian_quadrature return true end # # TETRACHORIC SERIES IS COMPUTED # # INITIALIZATION # va=1.0 vb=@zac.to_f wa=1.0 wb=@zab.to_f term = 1.0 iterm = 0.0 @sum = @probab * @probac deriv = 0.0 sr = @ss #15 begin if(sr.abs<=CONST) # # RESCALE TERMS TO AVOID OVERFLOWS AND UNDERFLOWS # sr = sr / CONST va = va * CHALF vb = vb * CHALF wa = wa * CHALF wb = wb * CHALF end # # FORM SUM AND DERIVATIVE OF SERIES # # 20 dr = sr * va * wa sr = sr * @rr / term cof = sr * va * wa # # ITERM COUNTS NO. OF CONSECUTIVE TERMS < CONV # iterm+= 1 iterm=0 if (cof.abs > CONV) @sum = @sum + cof deriv += dr vaa = va waa = wa va = vb wa = wb vb = @zac * va - term * vaa wb = @zab * wa - term * waa term += 1 end while (iterm < 2 or term < 6) # # CHECK IF ITERATION CONVERGED # if((@sum-@probaa).abs <= CITER) @itype=term calculate_sdr return true end # # CALCULATE NEXT ESTIMATE OF CORRELATION # #25 @iter += 1 # # IF TOO MANY ITERATlONS, RUN IS TERMINATED # delta = (@sum - @probaa) / deriv @rrprev = @rr @rr = @rr - delta @rr += 0.5 * delta if(@iter == 1) @rr= RLIMIT if (@rr > RLIMIT) @rr =0 if (@rr < 0.0) end while @iter < NITER raise "Too many iteration" # GOTO 10 end
Standard error
# File lib/statsample/bivariate/tetrachoric.rb, line 115 def se @sdr end
Threshold for variable x (rows) Point on gauss curve under X
rater select cases
# File lib/statsample/bivariate/tetrachoric.rb, line 120 def threshold_x @zab end
Threshold for variable y (columns) Point on gauss curve under Y rater select cases
# File lib/statsample/bivariate/tetrachoric.rb, line 127 def threshold_y @zac end
Private Instance Methods
# File lib/statsample/bivariate/tetrachoric.rb, line 445 def calculate_cosine # # WHEN ALL MARGINALS ARE EQUAL THE COSINE FUNCTION IS USED # @rr = -Math::cos(TWOPI * @probaa) @itype = 2 calculate_sdr end
# File lib/statsample/bivariate/tetrachoric.rb, line 162 def compute if !ruby_engine and Statsample::OPTIMIZED and Statsample::STATSAMPLE__.respond_to? :tetrachoric compute_optimized else compute_ruby end end
85
COMPUTE SDZERO
# File lib/statsample/bivariate/tetrachoric.rb, line 481 def compute_sdzero @sdzero = Math::sqrt(((@aa + @bb) * (@aa + @cc) * (@bb + @dd) * (@cc + @dd)).quo(@tot)).quo(@tot ** 2 * @ss) @sdr = @sdzero if (@r == 0) end
GAUSSIAN QUADRATURE 40
# File lib/statsample/bivariate/tetrachoric.rb, line 375 def gaussian_quadrature if(@iter==0) # INITIALIZATION, IF THIS IS FIRST ITERATION @sum=@probab*@probac @rrprev=0 end # 41 sumprv = @probab - @sum @prob = @bb.quo(@tot) @prob = @aa.quo(@tot) if (@ksign == 2) @itype = 1 # # LOOP TO FIND ESTIMATE OF CORRELATION # COMPUTATION OF INTEGRAL (SUM) BY QUADRATURE # # 42 begin rrsq = Math::sqrt(1 - @rr ** 2) amid = 0.5 * (UPLIM + @zac) xlen = UPLIM - amid @sum = 0 (1..16).each do |iquad| xla = amid + X[iquad] * xlen xlb = amid - X[iquad] * xlen # # TO AVOID UNDERFLOWS, TEMPA AND TEMPB ARE USED # tempa = (@zab - @rr * xla) / rrsq if (tempa >= -6.0) @sum = @sum + W[iquad] * Math::exp(-0.5 * xla ** 2) * Distribution::Normal.cdf(tempa) end tempb = (@zab - @rr * xlb) / rrsq if (tempb >= -6.0) @sum = @sum + W[iquad] * Math::exp(-0.5 * xlb ** 2) * Distribution::Normal.cdf(tempb) end end # 44 ~ iquad @sum=@sum*xlen / SQT2PI # # CHECK IF ITERATION HAS CONVERGED # if ((@prob - @sum).abs <= CITER) calculate_sdr return true end # ESTIMATE CORRELATION FOR NEXT ITERATION BY LINEAR INTERPOLATION rrest = ((@prob - @sum) * @rrprev - (@prob - sumprv) * @rr) / (sumprv - @sum) rrest = RLIMIT if (rrest > RLIMIT) rrest = 0 if (rrest < 0) @rrprev = @rr @rr = rrest sumprv = @sum # # if estimate has same value on two iterations, stop iteration # if @rr == @rrprev calculate_sdr return true end end while @iter < NITER raise "Too many iterations" # ir a 42 end