[prev in list] [next in list] [prev in thread] [next in thread] 

List:       linux-ha-dev
Subject:    Re: [Linux-ha-dev] Cannot send T_STARTING
From:       Alan Robertson <alanr () unix ! sh>
Date:       2003-06-18 20:02:07
[Download RAW message or body]

Ozan Eren Bilgen wrote:
> Hi,
> 
> The code segment and necessary definitions are attached below. Maybe I did 
> something wrong, because this error occurs every time.
> 
> gdb says nothing more than what it says in the first line, "new thread (a 
> number) ...", which is probably not related to this failure.
> 
> The ltrace output is more than 300K, so I didn't attach it, out of respect for your inbox. 
> If you want to see it, please visit:
> 
> 	http://gsu.linux.org.tr/~mavi/2ncm/ltrace_2003-06-18

Thanks!  I excerpted the relevant part below.

> 
> The SIGINT in the ltrace is my Ctrl+C, which tells my program to 
> terminate its job.
> 
> How do I enable debugging?

You can enable debug from heartbeat with  -d.  If you give it -ddddddd then 
it will be EXTREMELY verbose.  You will know almost everything that it does ;-)

Here's the bug:


    if (hac->we->llc_ops->sendnodemsg (hac->we, reply, hac->they->nodename) 
== HA_OK)
 >       {
 >         cl_log (LOG_ERR, "Cannot send message to [%s]", hac->they->nodename);
 >         cl_log (LOG_ERR, "REASON: %s", hac->we->llc_ops->errmsg (hac->we));
 >         ha_log_message (reply);



The bug is that you had an == HA_OK which should have been != HA_OK.  This 
is why including the source code is essential.  Sometimes it's just that it 
is necessary to get other eyeballs on the source to see what the author 
can't see...


> Ozan Eren Bilgen
> 
> 
> 
> ------------------------------------------------------------------------
> 
> #include <stdio.h>
> #include <string.h>
> #include <heartbeat/ha_msg.h>
> #include <heartbeat/heartbeat.h>
> #include <heartbeat/hb_api.h>
> #include <clplumbing/cl_log.h>
> #include <clplumbing/cl_signal.h>
> #include <malloc.h>
> #include <stdlib.h>
> #include <unistd.h>
> #include <error.h>
> #include <errno.h>
> #include <time.h>
> 
> #include "2ncm.h"
> #include "rscops.h"
> #include "config.h"
> #include "callback.h"
> 
> 
> int
> main (int argc, char **argv)
> {
> struct ha_cluster *hac;
> 
> struct ha_msg *reply;
> 
> ...
> 
> cl_log_enable_stderr (TRUE);
> cl_log_set_entity (argv[0]);
> cl_log_set_facility (LOG_DAEMON);
> 
> init (&hac);
> 
> gethostname (hac->config->host, MAXHOSTLENGTH);
> parse_hacf (hac, "/etc/ha.d/ha.cf");
> cl_log(LOG_INFO, "/etc/ha.d/ha.cf parsed.");
> parse_haresources (hac, "/etc/ha.d/haresources");
> cl_log(LOG_INFO, "/etc/ha.d/haresources parsed.");
> 
> if ((hac->we = ll_cluster_new ("heartbeat")) == NULL)
> {
> cl_log (LOG_ERR, "Cannot create a heartbeat instance.");
> disaster (hac, NO, JUST_FREE);
> }
> else
> {
> cl_log (LOG_INFO, "Returned a handle to the heartbeat library instance. (PID=%ld)", \
> (long) getpid ()); }
> 
> if (hac->we->llc_ops->signon (hac->we, "2NCM") != HA_OK)
> {
> cl_log (LOG_ERR, "Cannot sign on to heartbeat.");
> 	cl_log (LOG_ERR, "REASON: %s", hac->we->llc_ops->errmsg (hac->we));
> disaster (hac, NO, DELETE);
> }
> 
> cl_log (LOG_INFO, "Signed on as \"2NCM\".");
> #if 0
> if (hac->we->llc_ops->setfmode (hac->we, LLC_FILTER_PMODE) == HA_OK)
> {
> 	cl_log(LOG_INFO, "PMODE filter established.");
> }
> else
> {
> 	cl_log(LOG_ERR, "Cannot filter with PMODE!");
> }
> #endif
> init_nodewalk(hac);
> cl_log (LOG_INFO, "Nodewalk ended.");
> 
> init_callback(hac);
> cl_log (LOG_INFO, "Callback fonctions add completed.");
> 
> CL_SIGINTERRUPT (SIGINT, 1);
> CL_SIGNAL (SIGINT, gotsig);
> 
> cl_log (LOG_INFO, "SIGINT hook done.");
> 
> if (hac->we->llc_ops->setmsgsignal (hac->we, 0) == HA_OK)
> {
> 	cl_log (LOG_INFO, "setmsgsignal() is set to 0.");
> }
> else
> {
> 	cl_log (LOG_INFO, "Cannot set setmsgsignal() to 0.");
> };
> 
> if (hac->config->our_active_links == 0)
> {
> /* OPPPSS! */
> cl_log (LOG_ERR, "2NCM started but we don't have any links UP!");
> }
> else
> {
> if ((hac->they->active_links > 0) && (hac->config->auto_failback == ON))
> {
> cl_log (LOG_INFO, "According to auto_failback, trying to send [%s] to [%s].", \
> T_STARTING, hac->they->nodename); /* We send a starting message and want to hear a \
> T_RESOURCES on callback */ reply = ha_msg_new (0);
> ha_msg_add (reply, F_TYPE, T_STARTING);
> if (hac->we->llc_ops->sendnodemsg (hac->we, reply, hac->they->nodename) == HA_OK)
> {
> cl_log (LOG_ERR, "Cannot send message to [%s]", hac->they->nodename);
> cl_log (LOG_ERR, "REASON: %s", hac->we->llc_ops->errmsg (hac->we));
> ha_log_message (reply);
> }
> else
> {
> cl_log(LOG_INFO, "[%s] message was sent. Waiting [%s] on the callback.", \
> T_STARTING, T_RESOURCES); }
> ha_msg_del (reply);
> reply = NULL;
> }
> else
> {
> 	  cl_log (LOG_INFO, "[%s] has no active links, T_STARTING was not send.", \
> hac->they->nodename); }
> }
> 
> cl_log (LOG_INFO, "Waiting for messages...");
> errno = 0;
> 
> ...
> 
> }
> 
> /*
> * 2ncm.h - Internal definitions used entirely in 2NCM
> *
> * ...
> */
> 
> #ifndef __2NCM_H__
> #define __2NCM_H__ 1
> 
> #include <heartbeat/heartbeat.h>
> #include <heartbeat/ha_msg.h>
> #include <heartbeat/hb_api.h>
> 
> #include "rscops.h"
> #include "config.h"
> #include "callback.h"
> 
> /* ... */
> 
> enum boolean { NO, YES };
> enum auto_failback_status { ON, OFF, LEGACY };
> enum resource_status { RELEASED, ABOUTTOBERELEASED, ACTIVATED, ABOUTTOBEACTIVATED \
> }; enum resource_level { ANY, LOCAL, FOREIGN, ALL };
> enum node_status { DEAD, UP, ACTIVE };
> 
> /* ... */
> 
> #define MAXHOSTLENGTH	200
> 
> /* ... */
> 
> struct ha_cluster
> {
> struct ll_cluster *we;
> struct othernode *they;
> struct cluster_configuration *config;
> struct resource_flags *flags;
> struct resource_operations *rscops;
> struct ha_service_group **ha_service_groups;
> };
> 
> struct othernode
> {
> char *nodename;
> enum resource_level hold_resources;
> enum node_status status;
> enum boolean need_stonith;
> int active_links;
> };
> 
> struct cluster_configuration
> {
> int our_active_links;
> enum auto_failback_status auto_failback;	/* ON | OFF | LEGACY */
> char *host;
> time_t configuration_time_haresources;
> time_t configuration_time_hacf;
> time_t started;
> int ha_service_groups_count;	/* =~ the number of useful lines in \
> /etc/ha.d/haresources */ };
> 
> struct resource_flags
> {
> enum job_status resource_management;
> enum resource_level we_hold_resources;
> enum node_status status;
> };
> 
> /* ... */
> 
> #endif /* __2NCM_H__ */
> 
> 
> ------------------------------------------------------------------------
> 
> lt-heartbeat: 2003/06/18_16:03:09 info: Logging defaulting to /var/log/ha-log
> lt-heartbeat: 2003/06/18_16:03:09 info: **************************
> lt-heartbeat: 2003/06/18_16:03:09 info: Configuration validated. Starting heartbeat \
>                 1.0.2
> lt-heartbeat: 2003/06/18_16:03:09 info: lt-heartbeat: version 1.0.2
> lt-heartbeat: 2003/06/18_16:03:09 info: Heartbeat generation: 49
> lt-heartbeat: 2003/06/18_16:03:09 info: Starting serial heartbeat on tty /dev/ttyS0 \
>                 (19200 baud)
> lt-heartbeat: 2003/06/18_16:03:09 info: UDP Broadcast heartbeat started on port 694 \
>                 (694) interface eth0
> lt-heartbeat: 2003/06/18_16:03:09 info: pid 1115 locked in memory.
> lt-heartbeat: 2003/06/18_16:03:09 info: pid 1116 locked in memory.
> lt-heartbeat: 2003/06/18_16:03:09 info: pid 1117 locked in memory.
> lt-heartbeat: 2003/06/18_16:03:09 info: pid 1118 locked in memory.
> lt-heartbeat: 2003/06/18_16:03:09 info: pid 1119 locked in memory.
> lt-heartbeat: 2003/06/18_16:03:09 info: Local status now set to: 'up'
> lt-heartbeat: 2003/06/18_16:03:09 info: pid 1113 locked in memory.
> lt-heartbeat: 2003/06/18_16:03:11 info: Link spiderman:eth0 up.
> lt-heartbeat: 2003/06/18_16:03:11 info: Local status now set to: 'active'
> lt-heartbeat: 2003/06/18_16:03:11 info: Link mcfly:eth0 up.
> lt-heartbeat: 2003/06/18_16:03:11 info: Status update for node mcfly: status active
> lt-heartbeat: 2003/06/18_16:04:42 info: EOF from client pid 1121

Here's the relevant part of the ltrace output...  I expected to see more 
cross-library linkages - but calling through the function pointers seems to 
defeat that :-(



cl_log(6, 0x0804dd00, 1194, 0x4008700e, 0x4018eee0/home/mavi/2ncm/2NCM: 
info: Callback fonctions add completed.
) = 0
siginterrupt(2, 1, 1194, 0x4008700e, 0x4018eee0)  = 0
cl_signal_set_simple_handler(2, 0x08048d70, 0, 0x4008700e, 0x4018eee0) = 0
cl_log(6, 0x0804dd22, 0, 0x4008700e, 0x4018eee0/home/mavi/2ncm/2NCM: info: 
SIGINT hook done.
)  = 0
cl_log(6, 0x0804dd34, 0, 0x4008700e, 0x4018eee0/home/mavi/2ncm/2NCM: info: 
setmsgsignal() is set to 0.
)  = 0
cl_log(6, 0x0804ddc0, 0x0804db87, 0x08052fd8, 
0x4018eee0/home/mavi/2ncm/2NCM: info: According to auto_failback, trying to 
send [starting] to [mcfly].
) = 0
ha_msg_new(0, 0x0804ddc0, 0x0804db87, 0x08052fd8, 0x4018eee0) = 0x08054cc0
ha_msg_add(0x08054cc0, 0x0804ddf9, 0x0804db87, 0x08052fd8, 0x4018eee0) = 1
cl_log(3, 0x0804ddfb, 0x08052fd8, 0x08052fd8, 
0x4018eee0/home/mavi/2ncm/2NCM: ERROR: Cannot send message to [mcfly]
) = 0
cl_log(3, 0x0804d980, 0x40035040, 0x08052fd8, 
0x4018eee0/home/mavi/2ncm/2NCM: ERROR: REASON:
) = 0
ha_log_message(0x08054cc0, 0x0804d980, 0x40035040, 0x08052fd8, 
0x4018eee0/home/mavi/2ncm/2NCM: info: MSG: Dumping message with 2 fields
/home/mavi/2ncm/2NCM: info: MSG[0]: [t=starting]
/home/mavi/2ncm/2NCM: info: MSG[1]: [dest=mcfly]
) = 2
ha_msg_del(0x08054cc0, 0x0804d980, 0x40035040, 0x08052fd8, 0x4018eee0) = 0
cl_log(6, 0x0804de93, 0x40035040, 0x08052fd8, 
0x4018eee0/home/mavi/2ncm/2NCM: info: Waiting for messages...
) = 0
__errno_location()                                = 0x4018ece0
ha_msg_value(0x08055de8, 0x0804ddf9, 0xbffffae8, 0x08049d51, 0x4018eee0) = 
0x08055e08
ha_msg_value(0x08055de8, 0x0804dead, 0xbffffae8, 0x08049d51, 0x4018eee0) = 
0x080546a8
cl_log(5, 0x0804dec0, 1, 0x08055e08, 0x080546a8/home/mavi/2ncm/2NCM: notice: 
Got message 1 of type [hbapi-clstat] from [spiderman]
)  = 0
ha_msg_del(0x08055de8, 0x0804dead, 0xbffffae8, 0x08049d51, 0x4018eee0) = 0
--- SIGINT (Interrupt) ---
cl_log(6, 0x0804d89a, 0, 0, 0/home/mavi/2ncm/2NCM: info: Recieved leave signal.
)                    = 0
breakpointed at 0x4012d767 (?)
cl_log(6, 0x0804d98b, 0xbffffaa8, 0x40031092, 
0x08055de8/home/mavi/2ncm/2NCM: info: Signed off from HeartBeat.
) = 0
cl_log(6, 0x0804d9c0, 0xbffffaa8, 0x40031092, 
0x08055de8/home/mavi/2ncm/2NCM: info: API Object is deleted.
) = 0
cl_log(5, 0x0804d9d7, 0xbffffaa8, 0x40031092, 
0x08055de8/home/mavi/2ncm/2NCM: notice: Memory freed.
) = 0
cl_log(6, 0x0804df00, 0xbffffae8, 0x08049d51, 
0x4018eee0/home/mavi/2ncm/2NCM: info: Exiting cleanly. (We'll be respawned)
-- 
     Alan Robertson <alanr@unix.sh>

"Openness is the foundation and preservative of friendship...  Let me claim 
from you at all times your undisguised opinions." - William Wilberforce

_______________________________________________________
Linux-HA-Dev: Linux-HA-Dev@lists.community.tummy.com
http://lists.community.tummy.com/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic