/*
 * Copyright 2009-2012 10gen, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Portions Copyright 2001 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */


#include "bson.h"
#include "encoding.h"

/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 *
 * Each row holds the entries for 32 consecutive byte values.  Bytes
 * 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3,
 * 0xF8-0xFB to 4 and 0xFC-0xFF to 5.
 *
 * The table only sizes a sequence; it does not validate the lead byte.
 * Bytes that cannot start a sequence (0x80-0xC1, and anything above 0xF4)
 * and the 5- and 6-byte forms are turned away later by isLegalUTF8().
 */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x1F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x3F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x5F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x7F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x9F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xBF */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xDF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5  /* 0xE0-0xFF */
};

/* --------------------------------------------------------------------- */

/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
 * The length can be set by:
 *  length = trailingBytesForUTF8[*source]+1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 *
 * source: first byte of the candidate sequence; `length` bytes are read.
 * length: total number of bytes in the sequence, lead byte included.
 *
 * Returns 1 when the sequence is legal and 0 otherwise.  Any length
 * outside 1..4 returns 0: the Unicode definition of UTF-8 goes up to
 * 4-byte sequences.  Overlong encodings, encoded UTF-16 surrogates
 * (U+D800..U+DFFF) and values above U+10FFFF are all rejected.
 */
static int isLegalUTF8( const unsigned char *source, int length ) {
    unsigned char a;
    const unsigned char *srcptr = source + length;

    /* Trailing bytes are checked last-to-first; each case falls through
     * to the next shorter one as long as the byte just examined is OK. */
    switch ( length ) {
    default:
        return 0;
    case 4:
        if ( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return 0;
        /* fall through */
    case 3:
        if ( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return 0;
        /* fall through */
    case 2:
        /* `a` is now the second byte; its lower bound depends on the lead. */
        if ( ( a = ( *--srcptr ) ) > 0xBF ) return 0;
        switch ( *source ) {
            /* no fall-through in this inner switch */
        case 0xE0:
            /* E0 80..9F would be an overlong 3-byte form. */
            if ( a < 0xA0 ) return 0;
            break;
        case 0xED:
            /* ED A0..BF encodes the surrogates U+D800..U+DFFF. */
            if ( a > 0x9F ) return 0;
            break;
        case 0xF0:
            /* F0 80..8F would be an overlong 4-byte form. */
            if ( a < 0x90 ) return 0;
            break;
        case 0xF4:
            /* F4 90.. would exceed U+10FFFF. */
            if ( a > 0x8F ) return 0;
            break;
        default:
            if ( a < 0x80 ) return 0;
        }
        /* fall through */
    case 1:
        /* 0x80-0xBF are continuation bytes; 0xC0/0xC1 only start overlongs. */
        if ( *source >= 0x80 && *source < 0xC2 ) return 0;
        if ( *source > 0xF4 ) return 0;
    }
    return 1;
}

/*
 * Report whether a field name is one of the db ref keys "$ref", "$id"
 * or "$db".
 *
 * string: the field name.  Only the bytes after the first are compared;
 *         the caller is expected to have checked that string[0] is '$'.
 * length: number of bytes in the name.
 *
 * Returns 1 only on an exact match: length 4 followed by "ref", or
 * length 3 followed by "id" or "db".  Longer names that merely begin
 * with one of the keys (e.g. "$reference", "$idx") return 0, as does
 * anything shorter than 3 bytes.
 */
static int bson_string_is_db_ref( const unsigned char *string, const int length ) {
    switch ( length ) {
    case 4:
        return string[1] == 'r' && string[2] == 'e' && string[3] == 'f';
    case 3:
        if ( string[1] == 'i' && string[2] == 'd' )
            return 1;
        return string[1] == 'd' && string[2] == 'b';
    default:
        return 0;
    }
}

/*
 * Scan the first `length` bytes of `string` and record any problems in
 * b->err.
 *
 * b:            object whose err field accumulates the flags below.
 * string:       bytes to examine.
 * length:       number of bytes to examine; nothing is read when it is
 *               zero or negative.
 * check_utf8:   when non-zero, require legal UTF-8.  The first bad or
 *               truncated sequence sets BSON_NOT_UTF8 and stops the scan.
 * check_dot:    when non-zero, set BSON_FIELD_HAS_DOT if a '.' is found.
 * check_dollar: when non-zero, set BSON_FIELD_INIT_DOLLAR if the first
 *               byte is '$' and the name is not a db ref key.
 *
 * Returns BSON_ERROR only for invalid UTF-8.  Dot and dollar findings are
 * reported through b->err alone and still yield BSON_OK.
 */
static int bson_validate_string( bson *b, const unsigned char *string,
                                 const int length, const char check_utf8, const char check_dot,
                                 const char check_dollar ) {

    int position = 0;
    int sequence_length = 1;

    /* Guard on length so an empty span never has its (nonexistent)
     * first byte inspected. */
    if( check_dollar && length > 0 && string[0] == '$' ) {
        if( !bson_string_is_db_ref( string, length ) )
            b->err |= BSON_FIELD_INIT_DOLLAR;
    }

    while ( position < length ) {
        if ( check_dot && string[position] == '.' ) {
            b->err |= BSON_FIELD_HAS_DOT;
        }

        if ( check_utf8 ) {
            sequence_length = trailingBytesForUTF8[string[position]] + 1;

            /* Compare against the bytes remaining rather than adding to
             * position, so the test cannot overflow for very large
             * lengths.  The sequence must fit before it is inspected. */
            if ( sequence_length > length - position ||
                    !isLegalUTF8( string + position, sequence_length ) ) {
                b->err |= BSON_NOT_UTF8;
                return BSON_ERROR;
            }
        }

        /* Without UTF-8 checking this stays 1 and every byte is visited. */
        position += sequence_length;
    }

    return BSON_OK;
}


/*
 * Check that the first `length` bytes of `string` are legal UTF-8.
 * Dots and a leading '$' are not examined.
 *
 * Returns the result of bson_validate_string(): BSON_ERROR (with
 * BSON_NOT_UTF8 set in b->err) on invalid UTF-8, BSON_OK otherwise.
 */
int bson_check_string( bson *b, const char *string,
                       const int length ) {
    /* The validator works on unsigned bytes. */
    const unsigned char *bytes = ( const unsigned char * )string;

    return bson_validate_string( b, bytes, length,
                                 1,    /* check_utf8 */
                                 0,    /* check_dot */
                                 0 );  /* check_dollar */
}

/*
 * Check a field name of `length` bytes: it must be legal UTF-8, and any
 * '.' or disallowed leading '$' is flagged in b->err.
 *
 * Returns the result of bson_validate_string(): BSON_ERROR (with
 * BSON_NOT_UTF8 set in b->err) on invalid UTF-8, BSON_OK otherwise.
 * Dot and dollar findings are visible only through b->err.
 */
int bson_check_field_name( bson *b, const char *string,
                           const int length ) {
    /* The validator works on unsigned bytes. */
    const unsigned char *bytes = ( const unsigned char * )string;

    return bson_validate_string( b, bytes, length,
                                 1,    /* check_utf8 */
                                 1,    /* check_dot */
                                 1 );  /* check_dollar */
}
