Michal Hlavinka
2014-06-25 10:27:21 UTC
Hi,
we found a bug caused by compiler optimizations.
The top of stack looks like this:
#0 job_chksave (pid=5066) at jobs.c:1949
#1 job_reap (sig=17) at obs.c:428
#2 <signal handler called>
#3 job_subsave () at sh/jobs.c:1990
#4 sh_subshell (shp=0x76cba0, t=0x7fd6050c9fe0, flags=4, comsub=3) at
subshell.c:520
It's based on patched 2012-08-01 so line numbers won't match precisely.
The interesting part is that job_reap was executed during job_subsave
while job_lock should prevent this:
void *job_subsave(void)
{
struct back_save *bp = new_of(struct back_save,0);
job_lock();
*bp = bck;
bp->prev = bck.prev;
bck.count = 0;
bck.list = 0; <<---- HERE when signal came
bck.prev = bp;
job_unlock();
return((void*)bp);
}
when checking the job.in_critical, gdb showed that it is zero
(gdb) p job.in_critical
$1 = 0
So the locking was not effective. I've checked the disassembled
job_subsave and to no surprise:
?0x429801 <job_subsave+1> mov $0x18,%edi
?0x429806 <job_subsave+6> callq 0x405de0 <malloc at plt>
?0x42980b <job_subsave+11> mov 0x343e9e(%rip),%rdx #
0x76d6b0 <bck>
?0x429812 <job_subsave+18> mov %rax,%rbx
?0x429815 <job_subsave+21> mov 0x343bed(%rip),%eax #
0x76d408 <job+40>
?0x42981b <job_subsave+27> movl $0x0,0x343e8b(%rip) #
0x76d6b0 <bck>
?0x429825 <job_subsave+37> mov %rdx,(%rbx)
?0x429828 <job_subsave+40> mov 0x343e89(%rip),%rdx #
0x76d6b8 <bck+8>
?0x42982f <job_subsave+47> test %eax,%eax
?0x429831 <job_subsave+49> movq $0x0,0x343e7c(%rip) #
0x76d6b8 <bck+8>
?0x42983c <job_subsave+60> mov %rdx,0x8(%rbx)
?0x429840 <job_subsave+64> mov 0x343e79(%rip),%rdx #
0x76d6c0 <bck+16>
?0x429847 <job_subsave+71> mov %rbx,0x343e72(%rip) #
0x76d6c0 <bck+16>
?0x42984e <job_subsave+78> mov %rdx,0x10(%rbx)
?0x429852 <job_subsave+82> jne 0x429887 <job_subsave+135>
?0x429854 <job_subsave+84> mov 0x343bb2(%rip),%edi #
0x76d40c <job+44>
?0x42985a <job_subsave+90> test %edi,%edi
?0x42985c <job_subsave+92> je 0x429887 <job_subsave+135>
?0x42985e <job_subsave+94> mov 0x341ca3(%rip),%rax #
0x76b508 <Vmregion>
?0x429865 <job_subsave+101> movl $0x1,0x343b99(%rip) #
0x76d408 <job+40>
?0x42986f <job_subsave+111> mov 0x60(%rax),%rdx
?0x429873 <job_subsave+115> mov $0x1,%eax
?0x429878 <job_subsave+120> mov (%rdx),%esi
?0x42987a <job_subsave+122> test %esi,%esi
?0x42987c <job_subsave+124> je 0x429890 <job_subsave+144>
?0x42987e <job_subsave+126> sub $0x1,%eax
?0x429881 <job_subsave+129> mov %eax,0x343b81(%rip) #
0x76d408 <job+40>
?0x429887 <job_subsave+135> mov %rbx,%rax
?0x42988a <job_subsave+138> pop %rbx
?0x42988b <job_subsave+139> retq
?0x42988c <job_subsave+140> nopl 0x0(%rax)
?0x429890 <job_subsave+144> callq 0x428d60 <job_reap>
?0x429895 <job_subsave+149> mov 0x343b6d(%rip),%eax #
0x76d408 <job+40>
?0x42989b <job_subsave+155> jmp 0x42987e <job_subsave+126>
there is no job.in_critical AKA <job+40> ++ nor --
Even with volatile attribute, gcc reorders the code so it locks,
immediately decrements in_critical again as part of unlocking and then
do the code that's supposed to be lock protected. Adding memory barriers
was necessary to prevent compiler from reorganizing the code.
See the attached patch.
Michal
-------------- next part --------------
A non-text attachment was scrubbed...
Name: ksh-20120801-locking.patch
Type: text/x-patch
Size: 1298 bytes
Desc: not available
URL: <http://lists.research.att.com/pipermail/ast-developers/attachments/20140625/91d6c666/attachment.bin>
we found a bug caused by compiler optimizations.
The top of stack looks like this:
#0 job_chksave (pid=5066) at jobs.c:1949
#1 job_reap (sig=17) at obs.c:428
#2 <signal handler called>
#3 job_subsave () at sh/jobs.c:1990
#4 sh_subshell (shp=0x76cba0, t=0x7fd6050c9fe0, flags=4, comsub=3) at
subshell.c:520
It's based on patched 2012-08-01 so line numbers won't match precisely.
The interesting part is that job_reap was executed during job_subsave
while job_lock should prevent this:
void *job_subsave(void)
{
struct back_save *bp = new_of(struct back_save,0);
job_lock();
*bp = bck;
bp->prev = bck.prev;
bck.count = 0;
bck.list = 0; <<---- HERE when signal came
bck.prev = bp;
job_unlock();
return((void*)bp);
}
when checking the job.in_critical, gdb showed that it is zero
(gdb) p job.in_critical
$1 = 0
So the locking was not effective. I've checked the disassembled
job_subsave and to no surprise:
?0x429801 <job_subsave+1> mov $0x18,%edi
?0x429806 <job_subsave+6> callq 0x405de0 <malloc at plt>
?0x42980b <job_subsave+11> mov 0x343e9e(%rip),%rdx #
0x76d6b0 <bck>
?0x429812 <job_subsave+18> mov %rax,%rbx
?0x429815 <job_subsave+21> mov 0x343bed(%rip),%eax #
0x76d408 <job+40>
?0x42981b <job_subsave+27> movl $0x0,0x343e8b(%rip) #
0x76d6b0 <bck>
?0x429825 <job_subsave+37> mov %rdx,(%rbx)
?0x429828 <job_subsave+40> mov 0x343e89(%rip),%rdx #
0x76d6b8 <bck+8>
?0x42982f <job_subsave+47> test %eax,%eax
?0x429831 <job_subsave+49> movq $0x0,0x343e7c(%rip) #
0x76d6b8 <bck+8>
?0x42983c <job_subsave+60> mov %rdx,0x8(%rbx)
?0x429840 <job_subsave+64> mov 0x343e79(%rip),%rdx #
0x76d6c0 <bck+16>
?0x429847 <job_subsave+71> mov %rbx,0x343e72(%rip) #
0x76d6c0 <bck+16>
?0x42984e <job_subsave+78> mov %rdx,0x10(%rbx)
?0x429852 <job_subsave+82> jne 0x429887 <job_subsave+135>
?0x429854 <job_subsave+84> mov 0x343bb2(%rip),%edi #
0x76d40c <job+44>
?0x42985a <job_subsave+90> test %edi,%edi
?0x42985c <job_subsave+92> je 0x429887 <job_subsave+135>
?0x42985e <job_subsave+94> mov 0x341ca3(%rip),%rax #
0x76b508 <Vmregion>
?0x429865 <job_subsave+101> movl $0x1,0x343b99(%rip) #
0x76d408 <job+40>
?0x42986f <job_subsave+111> mov 0x60(%rax),%rdx
?0x429873 <job_subsave+115> mov $0x1,%eax
?0x429878 <job_subsave+120> mov (%rdx),%esi
?0x42987a <job_subsave+122> test %esi,%esi
?0x42987c <job_subsave+124> je 0x429890 <job_subsave+144>
?0x42987e <job_subsave+126> sub $0x1,%eax
?0x429881 <job_subsave+129> mov %eax,0x343b81(%rip) #
0x76d408 <job+40>
?0x429887 <job_subsave+135> mov %rbx,%rax
?0x42988a <job_subsave+138> pop %rbx
?0x42988b <job_subsave+139> retq
?0x42988c <job_subsave+140> nopl 0x0(%rax)
?0x429890 <job_subsave+144> callq 0x428d60 <job_reap>
?0x429895 <job_subsave+149> mov 0x343b6d(%rip),%eax #
0x76d408 <job+40>
?0x42989b <job_subsave+155> jmp 0x42987e <job_subsave+126>
there is no job.in_critical AKA <job+40> ++ nor --
Even with volatile attribute, gcc reorders the code so it locks,
immediately decrements in_critical again as part of unlocking and then
do the code that's supposed to be lock protected. Adding memory barriers
was necessary to prevent compiler from reorganizing the code.
See the attached patch.
Michal
-------------- next part --------------
A non-text attachment was scrubbed...
Name: ksh-20120801-locking.patch
Type: text/x-patch
Size: 1298 bytes
Desc: not available
URL: <http://lists.research.att.com/pipermail/ast-developers/attachments/20140625/91d6c666/attachment.bin>