Article: 4225 of comp.lang.perl Xref: feenix.metronet.com comp.lang.perl:4225 Newsgroups: comp.lang.perl Path: feenix.metronet.com!news.utdallas.edu!wupost!howland.reston.ans.net!agate!ames!olivea!apple.com!voder!berlioz.nsc.com!jedi!arielf From: arielf@mirage.nsc.com (Ariel Faigon) Subject: ckbal - check balanced tokens in C source (perl script) Message-ID: <1993Jul14.202605.13360@berlioz.nsc.com> Followup-To: comp.lang.perl Keywords: Reiser-cpp tokens balance C-language-tool Sender: news@berlioz.nsc.com (UseNet News account) Reply-To: arielf@mirage.nsc.com Organization: National Semiconductor Corp. Date: Wed, 14 Jul 1993 20:26:05 GMT Lines: 345 Have you ever been frustrated by the Reiser-cpp or pcc insufficient data in error messages? Have you ever got a message like "1073: missing #endif" without a reference to the line where the opening #if appears. Have you ever left out a closing comment and searched your source for the error in the wrong place? If so, 'ckbal' can help you. I wrote this about a year ago in an evening of such frustration, and just thought it might be a good idea to post this for the benefit of all. Sorry, the man page is not wrapped, and it doesn't handle C++ style comments. yet it saved me and my colleagues many long hours in the past year or so. Enjoy. #!/usr/local/bin/perl # On Messy-DOS systems start with the following 3 line prologue: @REM=(qq! @perl %0.bat %1 %2 %3 %4 %5 %6 %7 %8 %9 @goto end !) if 0 ; # # ckbal - check balanced tokens in C source. # # Usage: # ckbal [-d] [-w] C-files... # # checked token-pairs include # ( ) # [ ] # { } # /* */ # " " # ' ' # #ifxxx [#els...] #endif # # Errors detected: # o EOF hit with unclosed token (opened on line X) # o Closing token on line X has no matching open token # # Options: # -w Adds warnings about potential errors: # Unmatched token pairs within strings or comments # (e.g. Open comment within a comment) # # -d Adds debugging printouts # # States during parsing: # o In comment # o In string # o In char constant # o In code # # TODO: # o Make it simpler - more general, more table driven # but for a first shot it serves its purpose... # o Add C++, // style comments # BUGS: # o A character (single quoted) string is arbitrary long # o There are cases where only '-w' will spot the problem # # Author: Ariel Faigon, (arielf@mirage.nsc.com), May 13, 1992 # Donated to the public domain - please leave author data intact. Enjoy. # # I use eval because it may not work on my DOS, your mileage may vary. eval "require 'getopts.pl'"; unless ($@) { &Getopts('dw'); } # -- Opening -> closing token mapping %pairof = ( '(', ')', '[', ']', '{', '}', '/*', '*/', '"', '"', "'", "'", '#if', '#endif', '#ifdef', '#endif', '#ifndef', '#endif', ); # -- State transitions: state + token -> new-state %next_state = ( "code$;/*", 'comment', "code$;\"", 'string', # " close quote to keep 4dos happy "code$;'", 'char', "comment$;*/", 'code', "string$;\"", 'code', # " close quote to keep 4dos happy "char$;'", 'code' ); # -- single-char-codes -> real-tokens # text is 'canonicalized' using single char codes for convenience + efficiency %realtok_of = ( "\201", '/*', "\202", '*/', "\211", '#if', "\212", '#ifdef', "\213", '#ifndef', "\214", '#else', "\215", '#elsif', "\216", '#endif' ); # Given current state and token, determine if a token is "opening" # Should probably be another assoc table... sub opening { local($state, $tok) = @_; ($tok =~ m,[\[({]|/\*, || $state eq 'code' && $tok =~ /["']/ || $tok =~ /^#if/) ? 1 : 0; } # Given current state and token determine if a token is "closing" # Should probably be another assoc table... sub closing { local($state, $tok) = @_; ($tok =~ m,[])}]|\*/, || $tok eq "'" && $state eq 'char' || $tok eq '"' && $state eq 'string' || $tok eq '#endif') ? 1 : 0; } # -- pop and cleanup all remaining 'unclosed' tokens from stack sub cleanup_pop { local(*stack) = @_; local($tok, $line, $prefix); $prefix = ''; unless ($state eq 'code' || $at_eof) { $prefix = "warning (in $state): "; } while (@stack) { $tok = pop(@stack); $line = pop(@stack); if ($opt_w || !$prefix) { print STDERR "$0: $prefix$file, $line: '$tok': open without close\n"; } } } $0 =~ s,.*/,,; # Trim pathname for error messages unless (@ARGV) { print STDERR "ckbal: check balanced tokens in C source Usage: ckbal [-d] [-w] C-files... Checked token-pairs include: ( ), [ ], { }, /* */, \" \", ' ' #ifxxx [#els...] #endif Options: -w Adds warnings about potential errors: Unmatched token pairs within strings or comments (e.g. Open comment within a comment) -d Adds debugging printouts "; exit(1); } # -- main program foreach $file (@ARGV) { &dofile; } # -- process one file sub dofile { local(*stack); &canonicalize($file); if ($opt_d) { print "after canonicalize: '@tokens'\n"; } $line = 1; $state = 'code'; $at_eof = 0; while ($tok = shift(@tokens)) { if ($tok eq "\n") { $line++; next; } # Move to a new state if a transition is needed if ($next_state{$state, $tok}) { $new_state = $next_state{$state, $tok}; } else { $new_state = $state; # default - no change } if (&opening($state, $tok)) { # --- 'opening' tokens # special case warning: if ($opt_w && $state eq 'comment' && $tok eq '/*') { print STDERR "$0: warning: $file, $line: open-comment in comment\n"; } $state = $new_state; *stack = $state; $realstack = $state; if ($tok =~ /^#/) { push(@cpp, ($line, $tok)); $realstack = 'cpp'; } else { push(@stack, ($line, $tok)); } if ($opt_d) { print STDERR "$0: line $line: pushed $tok on $realstack stack\n"; } } elsif (&closing($state, $tok)) { # --- 'closing' tokens if ($tok =~ /^#/) { if (@cpp) { $pop = pop(@cpp); pop(@cpp); # get rid of line-no too if ($opt_d) { print STDERR "$0: line $line: popped $pop from cpp stack\n"; } } else { print STDERR "$0: $file, $line: '$tok': close without open\n"; } next; } $prev_tok = $stack[$#stack]; $prev_line = $stack[$#stack-1]; if ($pairof{$prev_tok} ne $tok) { if ($state eq 'string' && $tok eq '"' || $state eq 'char' && $tok eq "'" || $state eq 'comment' && $tok eq '*/') { shift(@stack); shift(@stack); # get rid of opening &cleanup_pop(*stack, $state); $state = $new_state; *stack = $state; next; } elsif ($state eq 'code') { print STDERR "$0: $file, $line: '$tok': close without open\n"; @string = (); @char = (); @comment = (); } else { # not in code: comment, string or char $prefix = "warning (in $state): "; if ($opt_w) { print STDERR "$0: $prefix$file, $line: '$tok': close without open\n"; } if ($state eq 'string' && $tok eq '"') { @string = (); } elsif ($state eq 'comment' && $tok eq '*/') { @comment = (); } elsif ($state eq 'char' && $tok eq "'") { @char = (); } } } else { # match found: pop matching token (and line) from stack pop(@stack); pop(@stack); if ($opt_d) { print STDERR "$0: line $line: popped $tok from $state stack\n"; } $state = $new_state; *stack = $state; } # found matching open to close token } # open or close token elsif ($tok =~ /#els/) { unless (@cpp) { print STDERR "$0: $file, $line: '#if' less '$tok'\n"; } } } # while tokens in input $at_eof = 1; $eof_header = 0; for $st ('code', 'comment', 'string', 'char', 'cpp') { eval "*stack = *$st"; if (@stack) { unless ($eof_header) { print STDERR "$0: $file", ': EOF with unclosed tokens:',"\n"; $eof_header = 1; } &cleanup_pop(*stack, $st); } } } # # canonicalize: # Convert input file to a list of tokens while stripping all # "uninteresting" tokens from input. # # o Delete every \x to ease string and char constant parsing. # o Replace every two-chars interesting tokens (like /* and */) # by a single-char code # o Leave paired-tokens (like {}[]) and newlines (to count lines) # deleting all the rest # sub canonicalize { local($file) = @_; local($_, $tok) = ('', ''); # undef $/; # read all input into one long string # $* = 1; # Allow multi-line pattern matching @tokens = (); open(FH, $file) || die "$0: cannot open '$file' - $!\n"; while () { @Ltokens = (); s/\\[^\n]//g; # Get rid of escape-sequences to simplify string parsing # normalize to one-char tokens to simplify parsing s,/\*,\201,g; s,\*/,\202,g; s,^\s*#\s*if,\211,; s,^\s*#\s*ifdef,\212,; s,^\s*#\s*ifndef,\213,; s,^\s*#\s*else,\214,; s,^\s*#\s*elsif,\215,; s,^\s*#\s*endif,\216,; # Get rid of all chars except 'interesting' chars y/()[]{}"'\n\201-\230//cd; @Ltokens = split(//, $_); # convert line to list of tokens while ($tok = shift(@Ltokens)) {# translate back to real-tokens if ($realtok_of{$tok}) { push(@tokens, $realtok_of{$tok}); } else { push(@tokens, $tok); } } } close(FH); # @tokens; # return interesting list of tokens for whole file } # # Messy-DOS epilogue: # @REM=(qq! :end !) if 0 ; --- Peace, Ariel arielf@mirage.nsc.com