
    g%                         d Z ddlmZ ddlZddlZddlZ G d d      Z	 d
	 	 	 	 	 	 	 ddZ	 d
	 	 	 	 	 	 	 	 	 ddZddZ	d	 Z
y)zJThis is an educational implementation of the byte pair encoding algorithm.    )annotationsNc                  X    e Zd Zd	dZd
ddZddZddZddZedd       Z	ed        Z
y)SimpleBytePairEncodingc                   || _         || _        |j                         D ci c]  \  }}||
 c}}| _        t	        j
                  |      | _        yc c}}w )zCreates an Encoding object.N)pat_strmergeable_ranksitems_decoderregexcompile_pat)selfr   r   token_bytestokens        k/var/www/python.lazyprojects.co.uk/rdoDiscordBot/venv/lib/python3.12/site-packages/tiktoken/_educational.py__init__zSimpleBytePairEncoding.__init__   sT     .FUF[F[F]^F]0BU+F]^MM'*	 _s   Ac                    | j                   j                  |      }g }|D ]<  }|j                  d      }t        | j                  ||      }|j                  |       > |S )z`Encodes a string into tokens.

        >>> enc.encode("hello world")
        [388, 372]
        utf-8)	visualise)r   findallencode
bpe_encoder   extend)r   textr   wordstokensword
word_bytesword_tokenss           r   r   zSimpleBytePairEncoding.encode   s]     		!!$'DW-J$T%9%9:QZ[KMM+&	 
     c                8     dj                   fd|D              S )znDecodes a list of tokens into bytes.

        >>> enc.decode_bytes([388, 372])
        b'hello world'
        r    c              3  <   K   | ]  }j                   |     y wNr
   ).0r   r   s     r   	<genexpr>z6SimpleBytePairEncoding.decode_bytes.<locals>.<genexpr>-   s     A&e,&s   )joinr   r   s   ` r   decode_bytesz#SimpleBytePairEncoding.decode_bytes'   s     xxA&AAAr    c                F    | j                  |      j                  dd      S )u   Decodes a list of tokens into a string.

        Decoded bytes are not guaranteed to be valid UTF-8. In that case, we replace
        the invalid bytes with the replacement character "�".

        >>> enc.decode([388, 372])
        'hello world'
        r   replaceerrors)r)   decoder(   s     r   r.   zSimpleBytePairEncoding.decode/   s%       (//	/JJr    c                F    |D cg c]  }| j                   |    c}S c c}w )zDecodes a list of tokens into a list of bytes.

        Useful for visualising how a string is tokenised.

        >>> enc.decode_tokens_bytes([388, 372])
        [b'hello', b' world']
        r$   )r   r   r   s      r   decode_tokens_bytesz*SimpleBytePairEncoding.decode_tokens_bytes:   s%     399&e$&999s   c                8    t        | ||      }t        ||      S )z#Train a BPE tokeniser on some data!)data
vocab_sizer   r   r   )	bpe_trainr   )training_datar3   r   r   s       r   trainzSimpleBytePairEncoding.trainD   s      $:W^_%gWWr    c                    t        | t              rt        j                  |       } t	        | j
                  | j                        S )Nr4   )
isinstancestrtiktokenget_encodingr   _pat_str_mergeable_ranks)encodings    r   from_tiktokenz$SimpleBytePairEncoding.from_tiktokenJ   s:    h$,,X6H%%%x7P7P
 	
r    N)r   r:   r   dict[bytes, int]returnNonecolour)r   r:   r   
str | NonerB   	list[int])r   rG   rB   bytes)r   rG   rB   r:   )r   rG   rB   list[bytes])r6   r:   r3   intr   r:   )__name__
__module____qualname__r   r   r)   r.   r0   staticmethodr7   r@    r    r   r   r      sG    + B	K: X X
 
 
r    r   c                   |D cg c]  }t        |g       }}	 |r |dv rt        |       n|dk(  rt        |       d }d }t        t	        |d d |dd              D ].  \  }}| j                  |d   |d   z         }	|	#||	|k  s+|}|	}0 |n#|J |d | ||   ||dz      z   gz   ||dz   d  z   }|r
t                |D 
cg c]  }
| |
   	 }}
|S c c}w c c}
w )NrE   colorsimple   r      )rH   visualise_tokensprint	enumeratezipget)r   inputr   bpartsmin_idxmin_rankipairrankpartr   s               r   r   r   S   s7    "''AUA3ZE'
// 'h&e  U3BZqr!;<GAt"&&tAwa'89DX%5	 = """ hw5>E'A+4F#F"GG%PWZ[P[P]J^^/ 2 056od#F6M= (: 7s   CCc           
        |dk  rt        d      i }t        d      D ]  }||t        |g      <    t        j                  ||       D cg c]+  }|j                  d      D cg c]  }t        |g       c}- }}}t        |      |k  rt        j                         |D ]&  }	t        |	d d |	dd        D ]  }
|
xx   dz  cc<    ( t        fd      }|d   |d   z   }t        |      }|||<   g }|D ]  }g }d}|t        |      dz
  k  rR||   ||dz      f|k(  r|j                  |       |d	z  }n|j                  ||          |dz  }|t        |      dz
  k  rR|t        |      dz
  k(  r|j                  ||          |j                  |        |}|rt        d
|d    d|d           t        d| dt        |       d       |dv r1t        d       t        |d d D cg c]  }|D ]  }|  c}}       n%|dk(  r t        d       |d d D ]  }t        |        t        d       t        |      |k  r|S c c}w c c}}w c c}}w )N   z;vocab_size must be at least 256, so we can encode all bytesr   rT   rU   c                    |    S r#   rO   )xstatss    r   <lambda>zbpe_train.<locals>.<lambda>   s	    E!Hr    )keyr   rV   z The current most common pair is z + zSo we made z our zth tokenrQ   z9Now the first fifty words in our training data look like:2   rS   z:Now the first twenty words in our training data look like:   
)
ValueErrorrangerH   r   r   r   lencollectionsCounterrZ   maxappendrX   rW   )r2   r3   r   r   ranksra   r   r]   r   piecerb   most_common_pairr   r   	new_wordsnew_wordri   s                   @r   r5   r5   w   s    DVWWE4[eQCj  @E}}WVZ?[ ?[tT[[121s12?[ 
  
 e*z
!##%EE#2Jab	2dq  3  u*<=&q),<Q,??E
"k 	DHAc$i!m#GT!a%[)-==OOK0FAOODG,FA c$i!m# CIM!Q(X&   45Ea5H4IM]^_M`LabcK}E#e*XFG//QR E#2J!QJDD5%D%J!QRh&RS!#2JD$K '$KW e*z
!Z Lc 	3 V "Rs   I$I7I0I
Ic                L   dD cg c]  }d| d
 }}| D cg c]  }|j                  dd       }}d}d }|D ]S  }||t        |      z     }||k(  r||dz   t        |      z     }||k7  sJ |}|t        |      z  }t        ||z   d	
       U t        d       y c c}w c c}w )N)         M   P   D      z[48;5;mr   r+   r,   r   rU    )endz[0m)r.   rq   rX   )	token_valuesra   
backgroundrh   unicode_token_valuesrunning_length
last_colorr   rR   s	            r   rW   rW      s    /OP/O!L1%/OJP JVVAAHHWYH?VNJ%>C
O;<J 2c*oEFEJ&&&
#e*$eem$ & 
+! Q Ws
   BB!c                 b   d} t        t              5 }|j                         }d d d        t        j	                  d|       }t        d       |j                  d      }|j                  |      dk(  sJ |j                  |      dk(  sJ |j                  |      ddgk(  sJ |S # 1 sw Y   xY w)	NzN's|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+iX  )r3   r   zJThis is the sequence of merges performed in order to encode 'hello world':zhello worlds   hello worlds   hellos    world)
open__file__readr   r7   rX   r   r.   r)   r0   )gpt2_patternfr2   encr   s        r   train_simple_encodingr      s    ]  
h1vvx 
 !
&
&t\
&
RC	
VWZZ&F::f...F#~555""6*x.CCCCJ 
s   B%%B.rD   )r   rA   r\   rH   r   rF   rB   rG   )
r2   r:   r3   rJ   r   r:   r   rF   rB   rA   )r   rI   rB   rC   )__doc__
__future__r   rr   r   r;   r   r   r5   rW   r   rO   r    r   <module>r      s    P "   D
 D
P NV!%!.3!@J!!J GOB
BB),B9CBBJ(r    