2
\$\begingroup\$

In trying to understand and use the Parser Combinator concept, I've coded up as close an analogue as I could manage within the constraints of the C language.

C doesn't have first-class functions or dynamic code generation, so instead I use a pseudo-object-oriented approach which builds a tree to represent the parser, with function-pointers at each node.

A Parser returns a list of results, so this is implemented with singly-linked lists. But trees and lists are not the focus of this code so the program just uses them without much fuss. Perhaps some operations should be abstracted, like searching to the end of the list.

This program uses two typedefd pointers. Not trying to hide it or anything, just factoring out lots of extra *s.

What improvements can I make to the overall organization of the program? Or any parts that could be more clear?

pc3.c:

#include <stdio.h> #include <stdlib.h> typedef struct parser *parser; typedef struct result *result; typedef result (*handler)( parser p, char *input ); struct parser { handler test; parser a, b; int c; }; struct result { result next; int length_matched; }; #define new_function_for_(type) \ type new_##type ( struct type input ){ \ type local = calloc( sizeof *local, 1 ); \ return local ? ( *local = input ), local : local; \ } new_function_for_(parser) new_function_for_(result) result parse( parser p, char *input ){ return p->test( p, input ); } result parse_fail( parser p, char *input ){ return NULL; } result parse_succeed( parser p, char *input ){ return new_result( (struct result){ .next = NULL, .length_matched = 0 } ); } result parse_term( parser p, char *input ){ return *input == p->c ? new_result( (struct result){ .next = NULL, .length_matched = 1 } ) : NULL; } result parse_alternate( parser p, char *input ){ result res = parse( p->a, input ); if( res ){ result r = res; while( r->next ) r = r->next; r->next = parse( p->b, input ); return res; } else return parse( p->b, input ); } void amend_lengths( result r, int length ){ r->length_matched += length; if( r->next ) amend_lengths( r->next, length ); } result parse_sequence_next( result r, parser p, char *input ){ if( ! r ) return NULL; result res = parse( p->b, input + r->length_matched ); if( res ){ amend_lengths( res, r->length_matched ); result tail = res; while( tail->next ) tail = tail->next; tail->next = parse_sequence_next( r->next, p, input ); return res; } else return parse_sequence_next( r->next, p, input ); } result parse_sequence( parser p, char *input ){ result res = parse( p->a, input ); return parse_sequence_next( res, p, input ); } parser fails(){ return new_parser( (struct parser){ .test = parse_fail } ); } parser succeeds(){ return new_parser( (struct parser){ .test = parse_succeed } ); } parser term( int c ){ return new_parser( (struct parser){ .test = parse_term, .c = c } ); } parser alternate( parser a, parser b){ return new_parser( (struct parser){ .test = parse_alternate, .a = a, .b = b } ); } parser sequence( parser a, parser b){ return new_parser( (struct parser){ .test = parse_sequence, .a = a, .b = b } ); } parser many( parser a ){ parser q = new_parser( (struct parser){ .test = parse_sequence, .a = a } ); parser r = new_parser( (struct parser){ .test = parse_alternate, .a = q, .b = succeeds() } ); q->b = r; return r; } parser some( parser a ){ parser q = new_parser( (struct parser){ .test = parse_sequence, .a = a } ); parser r = new_parser( (struct parser){ .test = parse_alternate, .a = q, .b = succeeds() } ); q->b = r; return q; } parser char_class( char *str ){ return *str ? alternate( term( *str ), char_class( str + 1 ) ) : fails(); } parser string( char *str ){ return *str ? sequence( term( *str ), string( str + 1 ) ) : succeeds(); } void print_results( result res ){ if( res ) printf( "%d ", res->length_matched ), print_results( res->next ); } void test( parser p, char *str ){ printf( "%s:", str ); fflush(0); result r = parse( p, str ); print_results( r ); printf( "\n" ); } void cases( void (*test)( parser p, char *str ) ){ parser p = string("hello"); test( p, "hello" ); test( p, "hell" ); test( p, "hello w" ); test( p, "xxxxx" ); parser q = many( term( 'x' ) ); test( q, "xy" ); test( q, "xx" ); test( q, "xxxy" ); test( q, "xxxxy" ); test( q, "xxxxx" ); parser r = sequence( many( term( 'x' ) ), string( "xy" ) ); test( r, "xy" ); test( r, "xx" ); test( r, "xxxy" ); test( r, "xxxxy" ); test( r, "xxxxx" ); } int main(){ cases( test ); return 0; } 

Output:

$ ./pc3 hello:5 hell: hello w:5 xxxxx: xy:1 0 xx:2 1 0 xxxy:3 2 1 0 xxxxy:4 3 2 1 0 xxxxx:5 4 3 2 1 0 xy:2 xx: xxxy:4 xxxxy:5 xxxxx: 

Earlier drafts and discussions in this thread. This program is partly based on a PostScript prototype discussed in this thread.

\$\endgroup\$
0

    1 Answer 1

    3
    \$\begingroup\$

    Or any parts that could be more clear?

    typedef struct parser *parser; 

    Although parser in struct parser and typedef struct parser *parser exists in different name spaces, having one as a struct and the other a pointer is far from clear. Suggest:

    typedef struct parser *parser_ptr; 

    ... or the equivalent.

    Type def-ing a struct pointer should be avoided. When needed use different names if the typedef is a pointer. The following looks wrong.

    parser q = new_parser(... ... q->b = r; // Should not code be a q.b, oh wait, `parser` is a pointer type. // Looks like a block of data is being returned, no wait, its a pointer. result parse(...) { ... } 

    Ref: Is it a good idea to typedef pointers?


    Code lack comments. A few comments, at least, would add clarity.


    Put test cases and main() in another files to clearly demarcate the code and the test code.


    No need to flush everything:

    printf( "%s:", str ); //fflush(0); fflush(stdout); 

    Subtle point. When int is used to store a character and subsequent code interacts with char, the standard C library model is to treat the values as unsigned char. This is important when char is signed and values outside 0-127 are used. Recall fgets() returns values in the unsigned char, EOF range.

    result parse_term( parser p, char *input ){ // return *input == p->c // return (unsigned char) *input == (unsigned char) (p->c) 

    There is testing for a null pointer in many places except for test code. Hmmm.

    \$\endgroup\$

      Start asking to get answers

      Find the answer to your question by asking.

      Ask question

      Explore related questions

      See similar questions with these tags.