
    cRG              
          d dl mZ d dlmZ d dlmZmZ ddlmZm	Z	m
Z
 ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ  G d d          Z G d	 d
e          Z G d de          Z G d de          Z G d de          Z  G d de          Z! G d de          Z" G d de          Z# G d de          Z$ ed          dee%         dee%         de&fd            Z' ed          	 d'd"e%d#e(d$e&de(fd%            Z)d&S )(    )	lru_cache)	getLogger)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                   V    e Zd ZdZdedefdZdeddfdZd	dZe	de
fd            ZdS )
MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                     t           )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr   s     7/usr/lib/python3/dist-packages/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible$   
     "!    Nc                     t           )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r    r"   s     r$   feedzMessDetectorPlugin.feed*   s
    
 "!r'   c                     t           )zB
        Permit to reset the plugin to the initial state.
        r    r#   s    r$   resetzMessDetectorPlugin.reset1   r&   r'   c                     t           )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r    r+   s    r$   ratiozMessDetectorPlugin.ratio7   s
     "!r'   r   N)__name__
__module____qualname____doc__strboolr%   r)   r,   propertyfloatr.    r'   r$   r   r      s         
"# "$ " " " ""c "d " " " "" " " " "u " " " X" " "r'   r   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
 TooManySymbolOrPunctuationPluginr   Nc                 L    d| _         d| _        d| _        d | _        d| _        d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr+   s    r$   __init__z)TooManySymbolOrPunctuationPlugin.__init__A   s0    '("#%&37!,1###r'   r   c                 *    |                                 S Nisprintabler"   s     r$   r%   z)TooManySymbolOrPunctuationPlugin.eligibleI       $$&&&r'   c                 (   | xj         dz  c_         || j        k    ro|t          vrft          |          r| xj        dz  c_        nF|                                du r0t          |          r!t          |          du r| xj        dz  c_        || _        d S )Nr   F   )	r>   r?   r   r   r<   isdigitr   r   r=   r"   s     r$   r)   z%TooManySymbolOrPunctuationPlugin.feedL   s    " 222!===i(( (''1,'''!!##u,,i(( -	**e33""a'""$-!!!r'   c                 0    d| _         d| _        d| _        d S Nr   )r<   r>   r=   r+   s    r$   r,   z&TooManySymbolOrPunctuationPlugin.reset^   s     "# !r'   c                 ^    | j         dk    rdS | j        | j        z   | j         z  }|dk    r|ndS )Nr           g333333?)r>   r<   r=   )r#   ratio_of_punctuations     r$   r.   z&TooManySymbolOrPunctuationPlugin.ratioc   sK     A%%3 #d&88!'" (<s'B'B##Kr'   r/   r0   r1   r2   rA   r4   r5   r%   r)   r,   r6   r7   r.   r8   r'   r$   r:   r:   @   s        2 2 2 2'# '$ ' ' ' '.c .d . . . .$   
 Lu L L L XL L Lr'   r:   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
TooManyAccentuatedPluginr   Nc                 "    d| _         d| _        d S rK   r>   _accentuated_countr+   s    r$   rA   z!TooManyAccentuatedPlugin.__init__p   s    %&'(r'   r   c                 *    |                                 S rC   )isalphar"   s     r$   r%   z!TooManyAccentuatedPlugin.eligiblet   s      """r'   c                 h    | xj         dz  c_         t          |          r| xj        dz  c_        d S d S Nr   )r>   r   rT   r"   s     r$   r)   zTooManyAccentuatedPlugin.feedw   sJ    ")$$ 	)##q(####	) 	)r'   c                 "    d| _         d| _        d S rK   rS   r+   s    r$   r,   zTooManyAccentuatedPlugin.reset}   s     !"#r'   c                 d    | j         dk    s| j         dk     rdS | j        | j         z  }|dk    r|ndS )Nr      rM   gffffff?rS   )r#   ratio_of_accentuations     r$   r.   zTooManyAccentuatedPlugin.ratio   sI     A%%)>)B)B3'+'>AV'V(=(E(E$$3Nr'   r/   rO   r8   r'   r$   rQ   rQ   o   s        ) ) ) )## #$ # # # #)c )d ) ) ) )$ $ $ $ Ou O O O XO O Or'   rQ   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
UnprintablePluginr   Nc                 "    d| _         d| _        d S rK   )_unprintable_countr>   r+   s    r$   rA   zUnprintablePlugin.__init__   s    '(%&r'   r   c                     dS NTr8   r"   s     r$   r%   zUnprintablePlugin.eligible       tr'   c                 d    t          |          r| xj        dz  c_        | xj        dz  c_        d S rX   )r   r`   r>   r"   s     r$   r)   zUnprintablePlugin.feed   s@    )$$ 	)##q(##"r'   c                     d| _         d S rK   )r`   r+   s    r$   r,   zUnprintablePlugin.reset   s    "#r'   c                 @    | j         dk    rdS | j        dz  | j         z  S )Nr   rM   r[   )r>   r`   r+   s    r$   r.   zUnprintablePlugin.ratio   s+     A%%3'!+t/DDDr'   r/   rO   r8   r'   r$   r^   r^      s        ' ' ' '# $    #c #d # # # #
$ $ $ $ Eu E E E XE E Er'   r^   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
SuspiciousDuplicateAccentPluginr   Nc                 0    d| _         d| _        d | _        d S rK   _successive_countr>   _last_latin_characterr+   s    r$   rA   z(SuspiciousDuplicateAccentPlugin.__init__   s     &'%&48"""r'   r   c                 H    |                                 ot          |          S rC   )rV   r   r"   s     r$   r%   z(SuspiciousDuplicateAccentPlugin.eligible   s!      "":x	':'::r'   c                 l   | xj         dz  c_         | j        t          |          rt          | j                  rr|                                r)| j                                        r| xj        dz  c_        t          |          t          | j                  k    r| xj        dz  c_        || _        d S rX   )r>   rl   r   isupperrk   r   r"   s     r$   r)   z$SuspiciousDuplicateAccentPlugin.feed   s    "&2y)) 3t9:: 3   "" ,t'A'I'I'K'K ,&&!+&&Y''=9S+T+TTT&&!+&&%."""r'   c                 0    d| _         d| _        d | _        d S rK   rj   r+   s    r$   r,   z%SuspiciousDuplicateAccentPlugin.reset   s     !" !%)"""r'   c                 @    | j         dk    rdS | j        dz  | j         z  S )Nr   rM   rH   )r>   rk   r+   s    r$   r.   z%SuspiciousDuplicateAccentPlugin.ratio   s+     A%%3&*d.CCCr'   r/   rO   r8   r'   r$   rh   rh      s        9 9 9 9;# ;$ ; ; ; ;/c /d / / / /* * * *
 Du D D D XD D Dr'   rh   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
SuspiciousRanger   Nc                 0    d| _         d| _        d | _        d S rK   )"_suspicious_successive_range_countr>   _last_printable_seenr+   s    r$   rA   zSuspiciousRange.__init__   s     78/%&37!!!r'   r   c                 *    |                                 S rC   rD   r"   s     r$   r%   zSuspiciousRange.eligible   rF   r'   c                 D   | xj         dz  c_         |                                st          |          s	|t          v r	d | _        d S | j        	|| _        d S t          | j                  }t          |          }t          ||          r| xj        dz  c_        || _        d S rX   )r>   isspacer   r   rv   r    is_suspiciously_successive_rangeru   )r#   r   unicode_range_aunicode_range_bs       r$   r)   zSuspiciousRange.feed   s    " 	i((	 888(,D%F$,(1D%F)6t7P)Q)Q)6y)A)A+O_MM 	933q833$-!!!r'   c                 0    d| _         d| _        d | _        d S rK   )r>   ru   rv   r+   s    r$   r,   zSuspiciousRange.reset   s      !23/$(!!!r'   c                 T    | j         dk    rdS | j        dz  | j         z  }|dk     rdS |S )Nr   rM   rH   g?)r>   ru   )r#   ratio_of_suspicious_range_usages     r$   r.   zSuspiciousRange.ratio   sH     A%%3 3a7!2"' +S003..r'   r/   rO   r8   r'   r$   rs   rs      s        8 8 8 8
'# '$ ' ' ' '.c .d . . . ..) ) ) )
 /u / / / X/ / /r'   rs   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
SuperWeirdWordPluginr   Nc                     d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr>   _bad_character_count_buffer_buffer_accent_countr+   s    r$   rA   zSuperWeirdWordPlugin.__init__   sO     !$%() */!). %&)*!)*!!!r'   r   c                     dS rb   r8   r"   s     r$   r%   zSuperWeirdWordPlugin.eligible	  rc   r'   c                    |                                 r| xj        |z  c_        t          |          r| xj        dz  c_        | j        du r|t          |          du st          |          r\t          |          du rKt          |          du r:t          |          du r)t          |          du rt          |          du rd| _        d S | j        sd S |                                st          |          st          |          r"| j        r| xj        dz  c_        t          | j                  }| xj        |z  c_        |dk    re| j        |z  dk    rd| _        t          | j        d                   r6| j        d                                         r| xj        dz  c_        d| _        |dk    r| j        r| xj        dz  c_        d| _        | j        r9| xj        dz  c_        | xj        t          | j                  z  c_        d| _        d| _        d| _        d	| _        d S |d
vr>|                                du r*t/          |          rd| _        | xj        |z  c_        d S d S d S d S )Nr   FT   g(\?   r   r   >   _-<=>|~)rV   r   r   r   r   r   r   r   r   r   r   ry   r   r   r   lenr>   r   ro   r   r   r   rI   r   )r#   r   buffer_lengths      r$   r)   zSuperWeirdWordPlugin.feed  s    	LLI%LLi(( /))Q.))(E11i((E11^I5N5N19%%..i((E11	**e33	**e33I&&%//+/(F| 	F"	&#1)#<#<"	&@LY@W@W"	&l"	& !!$T\!2!2M!!]2!!!!,}<tCC04D- "$,r"233 5R8H8P8P8R8R 5,,1,,04D-""t'?"((A-((,0)( 2$$)$$))S->->>)),1)',D$DL()D%%%@@@!!##u,,)$$ - )-D%LLI%LLLL A@,,,,r'   c                 v    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d S )Nr   Fr   )r   r   r   r   r   r>   r   r   r+   s    r$   r,   zSuperWeirdWordPlugin.resetB  sG    $)!#(   !$%!#$   r'   c                 P    | j         dk    r| j        dk    rdS | j        | j        z  S )N
   r   rM   )r   r   r   r>   r+   s    r$   r.   zSuperWeirdWordPlugin.ratioL  s3    r!!d&>!&C&C3(4+@@@r'   r/   rO   r8   r'   r$   r   r      s        + + + +# $    4&c 4&d 4& 4& 4& 4&l% % % % Au A A A XA A Ar'   r   c                   ^    e Zd ZdZd
dZdedefdZdeddfdZd
dZ	e
defd	            ZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r   Nc                 "    d| _         d| _        d S rK   _wrong_stop_count_cjk_character_countr+   s    r$   rA   zCjkInvalidStopPlugin.__init__Z  s    &')*!!!r'   r   c                     dS rb   r8   r"   s     r$   r%   zCjkInvalidStopPlugin.eligible^  rc   r'   c                 t    |dv r| xj         dz  c_         d S t          |          r| xj        dz  c_        d S d S )N>      丄   丅r   )r   r   r   r"   s     r$   r)   zCjkInvalidStopPlugin.feeda  sZ    &&""a'""F) 	+%%*%%%%	+ 	+r'   c                 "    d| _         d| _        d S rK   r   r+   s    r$   r,   zCjkInvalidStopPlugin.reseth  s    !"$%!!!r'   c                 :    | j         dk     rdS | j        | j         z  S )N   rM   )r   r   r+   s    r$   r.   zCjkInvalidStopPlugin.ratiol  s&    $r))3%(AAAr'   r/   )r0   r1   r2   r3   rA   r4   r5   r%   r)   r,   r6   r7   r.   r8   r'   r$   r   r   T  s         
+ + + +# $    +c +d + + + +& & & & Bu B B B XB B Br'   r   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
ArchaicUpperLowerPluginr   Nc                 h    d| _         d| _        d| _        d| _        d| _        d | _        d| _        d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr>   _last_alpha_seen_current_ascii_onlyr+   s    r$   rA   z ArchaicUpperLowerPlugin.__init__t  s?    	45,23*890%&/3)-   r'   r   c                     dS rb   r8   r"   s     r$   r%   z ArchaicUpperLowerPlugin.eligible  rc   r'   c                    |                                 ot          |          }|du }|r| j        dk    rt| j        dk    r4|                                du r| j        du r| xj        | j        z  c_        d| _        d| _        d | _        d| _        | xj	        dz  c_	        d| _        d S | j        du rt          |          du rd| _        | j        |                                r| j                                        s-|                                rB| j                                        r)| j        du r| xj        dz  c_        d| _        nd| _        nd| _        | xj	        dz  c_	        | xj        dz  c_        || _        d S )NFr   @   r   TrH   )rV   r   r   rI   r   r   r   r   r   r>   r   ro   islower)r#   r   is_concerned	chunk_seps       r$   r)   zArchaicUpperLowerPlugin.feed  s    ((**J/?	/J/J E)	 	=AA4::%%''500,5588688 23D.34D0$(D!DI!!Q&!!'+D$F#t++0C0Cu0L0L',D$ ,!!## 	"(=(E(E(G(G 	"!!##	"(,(=(E(E(G(G	" 9$$66!;66 %DII $DII!	",,1,, )r'   c                 h    d| _         d| _        d| _        d| _        d | _        d| _        d| _        d S )Nr   FT)r>   r   r   r   r   r   r   r+   s    r$   r,   zArchaicUpperLowerPlugin.reset  s?     !/0,-.*340 $	#'   r'   c                 :    | j         dk    rdS | j        | j         z  S )Nr   rM   )r>   r   r+   s    r$   r.   zArchaicUpperLowerPlugin.ratio  s&     A%%37$:OOOr'   r/   rO   r8   r'   r$   r   r   s  s        . . . .# $    (*c (*d (* (* (* (*T( ( ( ( Pu P P P XP P Pr'   r      )maxsizer{   r|   r   c                    | |dS | |k    rdS d| v rd|v rdS d| v sd|v rdS d| v sd|v r
d| v sd|v rdS |                      d          |                     d          }}|D ]}|t          v r||v r dS | dv |dv }}|s|r
d	| v sd	|v rdS |r|rdS d
| v sd
|v rd	| v sd	|v rdS | dk    s|dk    rdS d	| v sd	|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr
   )r{   r|   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss          r$   rz   rz     s    /"9t/))u/!!g&@&@uo%%)G)Gu 	?""g&@&@&&+*H*Hu)8)>)>* *S!! '   000!!!55 "
 	
	

 	33 ' 	 ,   E_$<$<u , u?""h/&A&AO##u'?'?5m++-/O/O5 	  E_$<$<333777O++}/O/O5o%%O)C)C54r'   i   皙?Fdecoded_sequencemaximum_thresholddebugc           	      Z   d t                                           D             }t          |           dz   }d}|dk     rd}n|dk    rd}nd}t          | d	z   t	          |                    D ]m\  }}|D ],}	|	                    |          r|	                    |           -|d
k    r	||z  d
k    s	||dz
  k    r!t          d |D                       }||k    r nn|rt          d          }
|
	                    t          d| d| d|            t          |           dk    rL|
	                    t          d| dd                     |
	                    t          d| dd                     |D ],}|
	                    t          |j         d|j                    -t          |d          S )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 "    g | ]} |            S r8   r8   ).0md_classs     r$   
<listcomp>zmess_ratio.<locals>.<listcomp>  s+     + + +

+ + +r'   r   rM   i       r   r      
r   c              3   $   K   | ]}|j         V  d S rC   )r.   )r   dts     r$   	<genexpr>zmess_ratio.<locals>.<genexpr>%  s$      !?!?r"(!?!?!?!?!?!?r'   charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r   zStarting with: NzEnding with: iz:    )r   __subclasses__r   zipranger%   r)   sumr   logr	   	__class__r.   round)r   r   r   	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   s               r$   
mess_ratior     s1   + +#5#D#D#F#F+ + +I &''!+F O||13))	4,.)),/) 04 7vGG  	5! 	) 	)H  ++ )i((( AII%"CCqHHfqj  !!?!?Y!?!?!???O"333 =/00

51R5 5et5 5!25 5	
 	
 	
   2%%JJuG0@"0EGGHHHJJuG.>suu.EGGHHH 	= 	=BJJu;;;;<<<<!$$$r'   N)r   F)*	functoolsr   loggingr   typingr   r   constantr   r	   r
   utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r:   rQ   r^   rh   rs   r   r   r   r4   r5   rz   r7   r   r8   r'   r$   <module>r      s               ! ! ! ! ! ! ! !         
                                   (" " " " " " " "D,L ,L ,L ,L ,L'9 ,L ,L ,L^O O O O O1 O O O4E E E E E* E E E0"D "D "D "D "D&8 "D "D "DJ1/ 1/ 1/ 1/ 1/( 1/ 1/ 1/hWA WA WA WA WA- WA WA WAtB B B B B- B B B>IP IP IP IP IP0 IP IP IPX 4Cc]C5=c]C	C C C CL 4IN4% 4%4%.34%BF4%
4% 4% 4% 4% 4% 4%r'   